In [5]:
from bs4 import BeautifulSoup
import requests

In [11]:
import numpy as np
import pandas as pd
import multiprocessing
from gensim import models
from gensim import corpora
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import Phrases
from gensim.models.fasttext import FastText
from gensim.models.wrappers.fasttext import FastText as FT_wrapper
from gensim.models.word2vec import LineSentence
from gensim.corpora import Dictionary
from nltk.tokenize import sent_tokenize, word_tokenize
from stop_words import get_stop_words
from random import shuffle
import time
import re
import pylab as pl
from ipywidgets import FloatProgress
from IPython import display
import matplotlib.pyplot as plt
%matplotlib notebook

In [73]:
def get_text_from_url_and_its_children(main_url):
    print("starting to crawl main url: ", main_url)
    # check validity of main_url
    resp = url_is_valid(main_url)
    if not resp:
        print("main_url is not valid")
        return False

    print("\nstarting to crawl all its children")
    # grab all urls in this web page
    urls = [main_url]
    urls.extend(get_urls_from_url(main_url))
    urls = list(set(urls)) # remove duplicated urls
    print("\n\nthese are the children links we crawled")
    print(urls, "\n")
    # grab all texts in each urls asynchronously
    # argmumentize urls
    urls = [(url, main_url) for url in urls]
    with multiprocessing.Pool(processes=24) as pool:
        text_data = pool.starmap(get_text_from_url_with_check, urls)
    text_data = [text for text in text_data if len(text_data) > 0] # remove empty returns
    text_data = [text for text_list in text_data for text in text_list] # get list elements to str
    return text_data

def get_text_from_url_with_check(url, main_url):
    resp = url_is_valid(url)
    if not resp:
        url = main_url + url
        resp = url_is_valid(url)
        if not resp:
            print("url:", url, "invalid")
            return []
    # double check if the url is visited
    if resp.url != url: # meaning its redirected
        print('the url is redirected, try https\n')
        # try https
        url = url[:4] + 's' + url[4:]
        resp = url_is_valid(url)
        if resp:
            if resp.url == url:
                print('try succeeded')
        else:
            return []
    # check if url is the child or sibling of main_url
    if url_compare(main_url, resp.url) < 10: # to avoid http://www.
        print('\nurl:', resp.url, 'might be irrelevent to', main_url, 'quit visiting\n')
        return []
    text_data = []
    for text in get_texts_from_resp(resp):
        text_data.append(text)
    return text_data

def get_urls_from_url(main_url):
    resp = requests.get(main_url)
    soup = BeautifulSoup(resp.content, 'html.parser')
    urls = []
    links = soup.find_all('a')
    for url in links:
        try:
            url = url.attrs['href']
            if len(url) > 5:
                urls.append(url)
        except:
            pass
    return urls

def get_texts_from_resp(resp):
    # parse the web response
    soup = BeautifulSoup(resp.content, 'html.parser')
    # find and filter texts
    print("These are texts under", resp.url)
    texts = soup.find_all('p')
    print("number of items grabed are", len(texts))
    texts = [text for text in texts if len(text.text) > 100]
    print("number of items after filtering", len(texts))
    # output texts
    for text in texts:
        #print(text.text)
        yield text.text

def url_is_valid(url):
    try:
        resp = requests.get(url, timeout=10)
        assert resp.status_code == 200
        return resp
    except:
        return False

def url_compare(url_target, url_income):
    n_same_letter = 0.0
    # delete all http or https 
    if url_target[4] == 's':
        url_target = url_target[5:]
    else:
        url_target = url_target[4:]
    if url_income[4] == 's':
        url_income = url_income[5:]
    else:
        url_income = url_income[4:]
    # check similarity
    min_len = min(len(url_target), len(url_income))
    for i in range(min_len-1):
        if url_target[i] == url_income[i]:
            n_same_letter += 1
        else:
            break
    return n_same_letter

In [78]:
main_url = 'https://www.aecfafrica.org/'
len(get_text_from_url_and_its_children(main_url))

starting to crawl main url:  https://www.aecfafrica.org/
main_url is not valid


TypeError: object of type 'bool' has no len()

In [10]:
import multiprocessing

urls = [('http://www.grayghostventures.com/indexa.html',),  ('https://twitter.com/GrayGhostVT',)]
with multiprocessing.Pool(processes=2) as pool:
    results = pool.starmap(get_text_from_url_and_its_children, urls)
len(results)

These are texts under http://www.grayghostventures.com/indexa.html
number of items grabed are 10
number of items after filtering 0

finish crawling texts from main url

starting to crawl all its children


these are the children links we crawled
['investorLogin/investorLogin.aspx', 'investorLogin/jobs.aspx', 'https://www.facebook.com/GrayGhostVentures', 'http://www.linkedin.com/company/gray-ghost-ventures', 'https://twitter.com/GrayGhostVT', '/indexa.html', '/indexa.html', '/about/history.html', '/about/philosophy.html', '/about/initiatives/impactventures.html', '/about/initiatives/impactventures.html#tabSection', '/about/initiatives/indianschoolfinancecompany.html#tabSection', '/about/team.aspx', '/about/history.html', '/about/history.html', '/portfolio/portfolioImpactVentures.aspx', '/portfolio/portfolioBusinessPlanSubmission.html', '/portfolio/portfolioImpactVentures.aspx', '/portfolio/portfolioImpactVentures.aspx', '/media/events.aspx', '/media/press.aspx', '../annualreports.html',

url: http://www.grayghostventures.com/indexa.html/portfolio/portfolioBusinessPlanSubmission.html invalid
url: http://www.grayghostventures.com/indexa.html/about/history.html invalid
url: https://twitter.com/GrayGhostVT/i/moments invalid
url: http://www.grayghostventures.com/indexa.html/media/press.aspx invalid
url: https://twitter.com/GrayGhostVT/rossbaird/status/761065150799499264 invalid
url: http://www.grayghostventures.com/indexa.html/portfolio/portfolioImpactVentures.aspx invalid
url: https://twitter.com/GrayGhostVT//support.twitter.com/articles/20170514 invalid
These are texts under http://www.grayghostventures.com/contact/contact.html
number of items grabed are 12
number of items after filtering 2
These are texts under http://www.grayghostventures.com/about/history.html
number of items grabed are 12
number of items after filtering 2

url: https://us12.campaign-archive.com/?u=01baa5884cebc1d814aa95699&id=aff255ec3a might be irrelevent to main_url, quit visiting


url: https://twi

number of items after filtering 9
These are texts under https://twitter.com/GrayGhostVT?lang=fil
number of items grabed are 51
number of items after filtering 30
These are texts under https://twitter.com/GrayGhostVT?lang=no
number of items grabed are 51
number of items after filtering 28
url: https://twitter.com/GrayGhostVT/OmidyarNetwork invalid

url: https://pbs.twimg.com/profile_images/3108264190/4172b633533d480e001b3831120b1425.jpeg might be irrelevent to main_url, quit visiting

url: https://twitter.com/GrayGhostVT/daphnepit71/status/736163429875093504 invalid

url: http://www.impactassets.org/ia50_new/?platform=hootsuite might be irrelevent to main_url, quit visiting

url: https://twitter.com/GrayGhostVT/GrayGhostVT/media invalid
url: https://twitter.com/GrayGhostVT/cavalcanti_nyc/status/832113778116411392 invalid
url: https://twitter.com/GrayGhostVT/LinkedOrg invalid


Process ForkPoolWorker-5:
Traceback (most recent call last):
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/process.py", line 252, in _bootstrap
    self.run()
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/queues.py", line 335, in get
    res = self._reader.recv_bytes()
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/home/yuze/anaconda3/envs/tensorflow/lib/python3.5/multiprocessing/connection.py", line 379, in _re

KeyboardInterrupt: 

NameError: name 'results' is not defined

In [10]:
main_url = 'http://www.grayghostventures.com/indexa.html'
get_text_from_url_and_its_children(main_url)

These are texts under http://www.grayghostventures.com/indexa.html
number of items grabed are 10
number of items after filtering 0

finish crawling texts from main url

starting to crawl all its children


these are the children links we crawled
['investorLogin/investorLogin.aspx', 'investorLogin/jobs.aspx', 'https://www.facebook.com/GrayGhostVentures', 'http://www.linkedin.com/company/gray-ghost-ventures', 'https://twitter.com/GrayGhostVT', '/indexa.html', '/indexa.html', '/about/history.html', '/about/philosophy.html', '/about/initiatives/impactventures.html', '/about/initiatives/impactventures.html#tabSection', '/about/initiatives/indianschoolfinancecompany.html#tabSection', '/about/team.aspx', '/about/history.html', '/about/history.html', '/portfolio/portfolioImpactVentures.aspx', '/portfolio/portfolioBusinessPlanSubmission.html', '/portfolio/portfolioImpactVentures.aspx', '/portfolio/portfolioImpactVentures.aspx', '/media/events.aspx', '/media/press.aspx', '../annualreports.html',

url: http://www.grayghostventures.com/indexa.htmlinvestorLogin/jobs.aspx invalid
url: http://www.grayghostventures.com/indexa.html/about/initiatives/impactventures.html#tabSection invalid
url: http://www.grayghostventures.com/indexa.html/indexa.html invalid
url: http://www.grayghostventures.com/indexa.html/about/philosophy.html invalid
These are texts under http://www.grayghostventures.com/about/history.html
number of items grabed are 12
number of items after filtering 2
Gray Ghost Ventures is a pioneer of the global impact investing movement and continues to be innovative in furthering its expansion. As one of the earliest private investors in microfinance, GGV seeks to eliminate poverty and strengthen communities through catalytic, early-stage investments in the developing world by focusing on enabling technology, financial services, and other products and services concentrated on enhancing the quality of life for large, underserved populations in emerging markets.
In 2005, Gray Ghos

These are texts under http://www.grayghostventures.com/investorLogin/trademarks.html
number of items grabed are 14
number of items after filtering 2
Trademarks, logos and service marks displayed on this site are registered and unregistered trademarks of WWV Holdings
        LLC its licensors or content providers, or other third parties. All of these trademarks, logos and service marks are the
        property of their respective owners. Nothing on this site shall be construed as granting, by implication, estoppel, or
        otherwise, any license or right to use any trademark, logo or service mark displayed on the site without the owner's prior
        written permission, except as otherwise described herein. WWV Holdings LLC reserves all rights not expressly granted in
        and to the site and its content. This site and all of its content, including but not limited to text, design, graphics,
        interfaces and code, and the selection and arrangement thereof, is protected as a 

url: http://www.grayghostventures.com/indexa.html/portfolio/portfolioBusinessPlanSubmission.html invalid


['This Privacy Policy (“Privacy Policy”) is designed to help you understand generally how Gray Ghost Ventures (GGV) and its affiliated entities use and disclose information about you in connection with the website on which this Privacy Policy is posted (the “Site”). Gray Ghost Ventures is the trade name for WWV Holdings, LLC and its affiliated for-profit entities. References to GGV include, where appropriate, its affiliated entities. This privacy policy has also been adopted by Gray Matters Capital, Inc. and The Rockdale Foundation (the “Foundations”). By using or accessing the Site, you expressly consent to our collection, use and disclosure of information in accordance with this Privacy Policy. This Privacy Policy is effective upon your first use of or access to the Site.',
 'Personal Information. “Personal Information” means information that personally identifies you, such as your name, phone number, mailing address or email address.\r\n        \n\r\n        Submitted Information. G

In [None]:
pd.read_csv("../input/InvestData_2017-Nov-22_0101.csv").iloc[:, :]