In [50]:
from bs4 import BeautifulSoup
import requests
import multiprocessing
import numpy as np
import pandas as pd
from gensim import models
from gensim import corpora
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import Phrases
from gensim.models.fasttext import FastText
from gensim.models.wrappers.fasttext import FastText as FT_wrapper
from gensim.models.word2vec import LineSentence
from gensim.corpora import Dictionary
from fuzzywuzzy import fuzz
from nltk.tokenize import sent_tokenize, word_tokenize
from stop_words import get_stop_words
from random import shuffle
import time
import re
import pylab as pl
from ipywidgets import FloatProgress
from IPython import display
import matplotlib.pyplot as plt
%matplotlib notebook



In [130]:
def get_urls_from_url(main_url):
    resp = requests.get(main_url)
    soup = BeautifulSoup(resp.content, 'html.parser')
    urls = []
    links = soup.find_all('a')
    for url in links:
        try:
            url = url.attrs['href']
            if len(url) > 5:
                urls.append(url)
        except:
            pass
    return urls

def get_texts_from_resp(resp):
    # parse the web response
    soup = BeautifulSoup(resp.content, 'html.parser')
    # find and filter texts
    print("These are texts under", resp.url)
    texts = soup.find_all('p')
    print("number of items grabed are", len(texts))
    texts = [text for text in texts if len(text.text) > 100]
    print("number of items after filtering", len(texts))
    # output texts
    for text in texts:
        #print(text.text)
        yield text.text

def url_is_valid(url):
    try:
        resp = requests.get(url, timeout=10)
        assert resp.status_code == 200
        return resp
    except:
        return False

def url_compare(url1, url2, thresh=70):
    """
    Based on the similarity between roots of two url, return whether these two url are smiliary or not
    """
    # extract pattern in () http(s)://()/???/???
    url1 = re.sub("(https?://)?", "", url1).split('/')[0]
    url2 = re.sub("(https?://)?", "", url2).split('/')[0]
    
    # find similarity between roots
    root_sim = fuzz.partial_ratio(url1, url2)
    
    if root_sim >= thresh:
        return True
    else:
        print(url1, " and ", url2, " may not be relevent")
        return False

def get_text_from_url_with_check(url, main_url):
    """
    The bottom function that extract text from url
    """
    # avoid url ends with .pdf
    if url.split(".")[-1] == "pdf":
        return []
    
    # check if url is valid
    resp = url_is_valid(url)
    # if the url is not valid, it is possible that it is in the form of 
    if not resp:
        if not "http" in url:
            url = main_url + url
            resp = url_is_valid(url)
            if not resp:
                print("url:", url, "invalid")
                return []
        else:
            print("url:", url, "invalid")
            return []
        
    # double check if the url is actually visited
    if resp.url != url: # meaning its redirected, which means an error happened
        # in many cases, the redirection is due to website has prefix https instead of http
        url = url[:4] + 's' + url[4:]
        resp = url_is_valid(url)
        if resp:
            if resp.url == url:
                print('try succeeded')
        else:
            return []
        
    # check if url is the child or sibling of main_url
    # sometimes, the url is directed to same irrelevent sites such as www.twitter.com etc.
    if not url_compare(main_url, resp.url):
        return []
    
    # get text from url
    text_data = []
    for text in get_texts_from_resp(resp):
        text_data.append(text)
    return text_data

def get_text_from_url_and_its_children(main_url):
    """
    Parallalize the text extraction process from given main url
    """
    # preprocess main url
    # remove space in url
    main_url = main_url.replace(" ", "")
    # force https:// prefix to the main url
    main_url = "https://" + re.sub("(https?://)?", "", main_url)
    # remove last "/" if there is one
    if main_url[-1] == "/":
        main_url = main_url[:-1]
    
    print("starting to crawl main url: ", main_url)
    
    # check validity of main_url
    resp = url_is_valid(main_url)
    if not resp:
        print("main_url: ", main_url, " is not valid")
        return False

    # grab all urls in this web page
    urls = [main_url]
    urls.extend(get_urls_from_url(main_url))
    # remove duplicated urls
    urls = list(set(urls)) 
    print("\n\nthese are the children links we crawled")
    print(urls, "\n")
    # grab all texts in each urls asynchronously
    # argmumentize urls
    urls = [(url, main_url) for url in urls]
    with multiprocessing.Pool(processes=24) as pool:
        text_data = pool.starmap(get_text_from_url_with_check, urls) 
    
    # collect output text data
    text_data = [text for text in text_data if len(text_data) > 0] # remove empty returns
    text_data = [text for text_list in text_data for text in text_list] # get list elements to str
    return " ".join(text_data)

In [None]:
database = pd.read_csv('../Examples/database.csv').iloc[:, [1, 5]]
database = database.dropna()
database['Crawled'] = np.vectorize(get_text_from_url_and_its_children)(database.iloc[:, 1])

starting to crawl main url:  https://www.aecfafrica.org


these are the children links we crawled
['http://www.parioagency.com', '/portfolio/agribusiness', '\n\n\n\n\n/about-us/who-we-are\n\n\n', '/node/326', '/portfolio/competitions', '/the-aecf-management', '\n\n\n\n\n/portfolio/aecf_gender_lens_investment\n\n\n', 'mailto:info@aecfafrica.org', '/about-us/strategic-partners', '/about-us/The-AECF-Board', 'tel:+254203675394', '/about-us/our-history', '\n\n\n\n\n/about-us/funding-partners\n\n\n', '/about-us/funding-partners', '/work-with-us', 'tel:+254703033394', '/portfolio/aecf_gender_lens_investment', '/node/299', '/pt-pt', '/portfolio/renewable-energy/REACT-EEP', 'https://www.linkedin.com/company/africa-enterprise-challenge-fund', '\n\n\n\n\n/portfolio/agribusiness\n\n\n', '/portfolio/renewable-energy', '/media-centre/news', '/portfolio/renewable_energy/react_ssa', '\n\n\n\n\n/portfolio/renewable-energy\n\n\n', '/media-centre/videos', '/node/329', '/media-centre/blog', '/node/300', '

number of items grabed are 18
number of items after filtering 5
These are texts under https://www.aecfafrica.org/portfolio/competitions
These are texts under https://www.aecfafrica.org/media-centre/videos
number of items grabed are 13
number of items grabed are 45
number of items after filtering 0
number of items after filtering 28
These are texts under https://www.aecfafrica.org/the-aecf-management
These are texts under https://www.aecfafrica.org/about-us/strategic-partners
number of items grabed are 15
number of items grabed are 6
number of items after filtering 11
number of items after filtering 3
These are texts under https://www.aecfafrica.org/node/329
number of items grabed are 3
number of items after filtering 0
These are texts under https://www.aecfafrica.org/about-us/funding-partners
number of items grabed are 19
number of items after filtering 10
These are texts under https://www.aecfafrica.org/pt-pt
number of items grabed are 8
number of items after filtering 3
These are tex

url: https://agra.org/grants/agra-2017-new-site/contact/ invalid
These are texts under https://agra.org/funding-partners/
number of items grabed are 2
number of items after filtering 2
url: https://agra.org/africas-big-philanthropy-agriculture-and-food-security-by-bbc/ invalid
starting to crawl main url:  https://www.globalinnovation.fund
main_url:  https://www.globalinnovation.fund  is not valid
starting to crawl main url:  https://acumen.org


these are the children links we crawled
['https://www.plusacumen.org/courses/social-entrepreneurship-101', 'http://acumenideas.com/risky-business-3116c40826a1', 'https://medium.com/energy-impact-series/encouraging-children-to-do-their-homework-can-be-a-tough-task-at-the-best-of-times-285ce31440f9', 'https://acumen.org/fellowships/regions/', 'https://acumen.org/anti-corruption-policy/', 'https://www.plusacumen.org/courses/introduction-human-centered-design', 'https://briteweb.com/', 'https://twitter.com/acumen', 'https://www.classy.org/checkout/

www.afdb.org  and  vimeo.com  may not be relevent
These are texts under http://idev.afdb.org/
number of items grabed are 15
number of items after filtering 5
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/countries/north-africa/morocco/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/about-us/organisational-structure/ethics-office/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/projects-and-operations/procurement/projects-procurements-services-contacts/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/topics-and-sectors/initiatives-partnerships/afdb-oecd-joint-initiative/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/topics-and-sectors/sectors/human-capital-development/ invalid
url: https://www.afdb.org/en/about-us/corporate-informatio

url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/topics-and-sectors/initiatives-partnerships/african-guarantee-fund-for-small-and-medium-sized-enterprises/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/documents/environmental-social-assessments/environmental-and-social-management-plans-esmp/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/documents/policy-documents/policies-on-cross-cutting-issues/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/countries/west-africa/guinea/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/rss-feeds/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/about-us/corporate-information/history/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/

url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/documents/publications/afdb-in-10/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/topics-and-sectors/sectors/environment/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/topics-and-sectors/initiatives-partnerships/multidonor-water-partnership-program/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/projects-and-operations/project-cycle/project-appraisal/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/countries/west-africa/burkina-faso/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/documents/publications/working-paper-series/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/projects-and-

url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/topics-and-sectors/initiatives-partnerships/congo-basin-forest-fund/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/topics-and-sectors/initiatives-partnerships/health-in-africa-fund/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/documents/publications/african-development-report/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/projects-and-operations/project-cycle/project-preparation/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/documents/knowledge/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/countries/west-africa/cape-verde/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/documen

url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/documents/publications/africa-competitiveness-report/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/knowledge/macro-economics-policy-forecasting-and-research/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/topics-and-sectors/initiatives-partnerships/african-financing-partnership/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/about-us/organisational-structure/administrative-tribunal/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/knowledge/publications/agricultural-statistics-capacity-indicators-ascis/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/topics-and-sectors/initiatives-partnerships/nepad-infrastructure-project-preparation-fa

url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/documents/publications/economic-briefs/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/knowledge/publications/africa-tourism-monitor/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/knowledge/publications/gender-poverty-and-environmental-indicators-on-african-countries/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/register/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/documents/corporate-procurement/requests-for-proposals-rfp/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/topics-and-sectors/initiatives-partnerships/agriculture-fast-track-aft-fund/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-developme

url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/about-us/corporate-information/african-development-fund-adf/adf-country-resources-allocation/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/topics-and-sectors/initiatives-partnerships/emergency-liquidity-facility-elf/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/documents/evaluation-reports/country-sector-reviews-and-case-studies/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/documents/compliance-reviews/boards-of-directors-resolutions/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/documents/financial-information/exchange-rates/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/countries/north-africa/algeria/ invalid
url: https://www

url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/news-and-events/events-calendar/advanced-search/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/news-and-events/interviews/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/topics-and-sectors/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/topics-and-sectors/topics/quality-assurance-results/safeguards-and-sustainability-series/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/topics-and-sectors/initiatives-partnerships/climate-for-development-in-africa-climdev-africa-initiative/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/documents/policy-documents/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-

url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/topics-and-sectors/sectors/water-supply-sanitation/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/news-and-events/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/about-us/organisational-structure/communication-and-external-relations/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/documents/project-operations/project-studies/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/about-us/organisational-structure/integrity-and-anti-corruption/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf/en/knowledge/macro-economics-policy-forecasting-and-research/publications/ invalid
url: https://www.afdb.org/en/about-us/corporate-information/african-developm

url: https://www.lgtvp.com/en/en/fellowship/about-the-fellowship/key-facts/ invalid
url: https://www.lgtvp.com/en/en/fellowship/ invalid
url: https://www.lgtvp.com/en/en/fellowship/about-the-fellowship/in-the-field/ invalid
url: https://www.lgtvp.com/en/en/detailnews/New-report-on-state-of-oral-healthcare-for-urban-low-income-households-in-India/ invalid
url: https://www.lgtvp.com/en/en/detailnews/Educate-Girls-Worlds-first-development-impact-bond-in-education-surpassed-target-outcomes/ invalid
These are texts under https://www.lgtvp.com/en/portfolio/portfolio-overview/
number of items grabed are 27
number of items after filtering 8
These are texts under https://www.lgtvp.com/en/fellowship/about-the-fellowship/register-and-apply/
number of items grabed are 37
number of items after filtering 13
url: https://www.lgtvp.com/en/en/important-legal-information/ invalid
url: https://www.lgtvp.com/en/en/fellowship/about-the-fellowship/register-and-apply/ invalid
url: https://www.lgtvp.com/en/en

number of items grabed are 63
number of items after filtering 41
These are texts under http://www.grofin.com/language/en/entrepreneurs/#section7_grofinclientsuccess
number of items grabed are 32
number of items after filtering 16
These are texts under https://www.grofin.com/language/en/about_grofin/
number of items grabed are 40
number of items after filtering 23
These are texts under http://www.grofin.com/language/en/business_support/#section3_financeexpertisesuccess
number of items grabed are 26
number of items after filtering 16
These are texts under https://www.grofin.com/language/en/careers/
number of items grabed are 24
number of items after filtering 13
These are texts under http://www.grofin.com/language/en/about_grofin/#section1_whoweare
number of items grabed are 40
number of items after filtering 23
These are texts under http://www.grofin.com/language/en/business_support/#section2_howweworkwithsmes
number of items grabed are 26
number of items after filtering 16
These are te

These are texts under http://idev.afdb.org/
number of items grabed are 15
number of items after filtering 5
These are texts under https://www.afdb.org/en/topics-and-sectors/initiatives-partnerships/trade-finance-program/
number of items grabed are 9
number of items after filtering 6
These are texts under https://www.afdb.org/en/knowledge/african-development-institute/strategic-cooperation/
number of items grabed are 8
number of items after filtering 8
These are texts under https://www.afdb.org/en/knowledge/publications/millennium-development-goals-mdgs-report/
number of items grabed are 16
number of items after filtering 5
These are texts under https://www.afdb.org/en/countries/west-africa/burkina-faso/
number of items grabed are 17
number of items after filtering 4
These are texts under https://www.afdb.org/en/topics-and-sectors/initiatives-partnerships/enhanced-private-sector-assistance-for-africa-epsa-initiative/
number of items grabed are 129
number of items after filtering 20
Thes

These are texts under https://www.afdb.org/en/news-and-events/projet-de-transport-urbain-dabidjan-329-millions-deuros-supplementaires-de-la-banque-africaine-de-developpement-pour-boucler-le-financement-du-projet-18603/
number of items grabed are 11
number of items after filtering 8
These are texts under https://www.afdb.org/en/topics-and-sectors/sectors/environment/
number of items grabed are 12
number of items after filtering 2
These are texts under https://www.afdb.org/en/countries/east-africa/djibouti/
number of items grabed are 13
number of items after filtering 1
These are texts under https://www.afdb.org/en/topics-and-sectors/initiatives-partnerships/african-guarantee-fund-for-small-and-medium-sized-enterprises/
number of items grabed are 27
number of items after filtering 23
These are texts under https://www.afdb.org/en/countries/west-africa/ghana/
number of items grabed are 18
number of items after filtering 3
These are texts under https://www.afdb.org/en/topics-and-sectors/ini

number of items grabed are 11
number of items after filtering 4
These are texts under https://www.afdb.org/en/documents/publications/country-brochure/
number of items grabed are 23
number of items after filtering 8
These are texts under https://www.afdb.org/en/countries/east-africa/south-sudan/
number of items grabed are 18
number of items after filtering 4
These are texts under https://www.afdb.org/en/projects-and-operations/financial-products/african-development-bank/
number of items grabed are 8
number of items after filtering 7
These are texts under https://www.afdb.org/en/topics-and-sectors/initiatives-partnerships/sustainable-energy-fund-for-africa/
number of items grabed are 19
number of items after filtering 10
These are texts under https://www.afdb.org/en/documents/project-related-procurement/
number of items grabed are 16
number of items after filtering 0
These are texts under https://www.afdb.org/en/blogs/industrialisation-and-trade-corner/
number of items grabed are 21
numb

number of items grabed are 21
number of items after filtering 4
These are texts under https://www.afdb.org/en/topics-and-sectors/initiatives-partnerships/health-in-africa-fund/
number of items grabed are 16
number of items after filtering 16
These are texts under https://www.afdb.org/en/documents/knowledge/
number of items grabed are 13
number of items after filtering 2
These are texts under https://www.afdb.org/en/projects-and-operations/procurement/resources-for-businesses/
number of items grabed are 20
number of items after filtering 13
These are texts under https://www.afdb.org/en/news-and-events/events-calendar/past/
number of items grabed are 19
number of items after filtering 10
These are texts under https://www.afdb.org/en/about-us/organisational-structure/administrative-tribunal/
number of items grabed are 16
number of items after filtering 10
These are texts under https://www.afdb.org/en/documents/publications/newsletters/
number of items grabed are 21
number of items after f

number of items after filtering 4
These are texts under https://www.afdb.org/en/documents/evaluation-reports/
number of items grabed are 19
number of items after filtering 1
These are texts under https://www.afdb.org/en/knowledge/statistics/about-us/
number of items grabed are 3
number of items after filtering 3
www.afdb.org  and  www.facebook.com  may not be relevent
These are texts under https://www.afdb.org/en/topics-and-sectors/sectors/transport/
These are texts under https://www.afdb.org/en/documents/publications/africa-competitiveness-report/
number of items grabed are 9
number of items after filtering 3
number of items grabed are 2
number of items after filtering 0
These are texts under https://www.afdb.org/en/countries/southern-africa/swaziland/
number of items grabed are 15
number of items after filtering 2
These are texts under https://www.afdb.org/en/countries/east-africa/eritrea/
number of items grabed are 15
number of items after filtering 1
These are texts under https://w

number of items grabed are 23
number of items after filtering 10
These are texts under https://www.afdb.org/en/knowledge/publications/policy-briefs/
number of items grabed are 12
number of items after filtering 5
These are texts under https://www.afdb.org/en/topics-and-sectors/initiatives-partnerships/african-legal-support-facility/
number of items grabed are 8
number of items after filtering 8
These are texts under https://www.afdb.org/en/countries/east-africa/tanzania/
number of items grabed are 19
number of items after filtering 3
These are texts under https://www.afdb.org/en/register/
number of items grabed are 2
number of items after filtering 0
These are texts under https://www.afdb.org/en/knowledge/macro-economics-policy-forecasting-and-research/about-us/
number of items grabed are 7
number of items after filtering 6
These are texts under https://www.afdb.org/en/countries/southern-africa/lesotho/
number of items grabed are 15
number of items after filtering 2
url: https://www.af

number of items grabed are 4
number of items after filtering 3
These are texts under https://www.afdb.org/en/documents/evaluation-reports/country-sector-reviews-and-case-studies/
number of items grabed are 15
number of items after filtering 1
These are texts under https://www.afdb.org/en/projects-and-operations/project-cycle/loan-negotiation/
number of items grabed are 2
number of items after filtering 2
These are texts under https://www.afdb.org/en/about-us/corporate-information/financial-information/investor-resources/capital-markets/
number of items grabed are 15
number of items after filtering 9
These are texts under https://www.afdb.org/en/topics-and-sectors/sectors/infrastructure/
number of items grabed are 2
number of items after filtering 0
These are texts under https://www.afdb.org/en/topics-and-sectors/topics/food-production/
number of items grabed are 3
number of items after filtering 1
These are texts under https://www.afdb.org/en/about-us/careers/young-professionals-progra

number of items grabed are 17
number of items after filtering 3
These are texts under https://www.afdb.org/en/documents/policy-documents/financing-policies/
number of items grabed are 3
number of items after filtering 0
These are texts under https://www.afdb.org/en/countries/east-africa/comoros/
number of items grabed are 13
number of items after filtering 1
These are texts under https://www.afdb.org/en/documents/project-operations/country-dialogue-papers/
number of items grabed are 16
number of items after filtering 0
These are texts under https://www.afdb.org/en/documents/knowledge/annual-meetings-seminars/
number of items grabed are 3
number of items after filtering 0
These are texts under https://www.afdb.org/en/documents/project-related-procurement/policies-and-procedures/
number of items grabed are 20
number of items after filtering 0
These are texts under https://www.afdb.org/en/knowledge/statistics/publications/
number of items grabed are 18
number of items after filtering 16
T

number of items grabed are 25
number of items after filtering 10
These are texts under https://www.afdb.org/en/documents/strategy-documents/special-initiatives/
number of items grabed are 23
number of items after filtering 0
These are texts under https://www.afdb.org/en/countries/southern-africa/botswana/
number of items grabed are 17
number of items after filtering 3
www.afdb.org  and  www.youtube.com  may not be relevent
These are texts under https://www.afdb.org/en/knowledge/african-development-institute/african-development-institute-contacts/
number of items grabed are 1
number of items after filtering 0
These are texts under https://www.afdb.org/en/countries/southern-africa/mozambique/
number of items grabed are 19
number of items after filtering 4
These are texts under https://www.afdb.org/en/documents/compliance-reviews/
number of items grabed are 9
number of items after filtering 0
These are texts under https://www.afdb.org/en/documents/legal-documents/partnership-agreements/
n

www.cdcgroup.com  and  twitter.com  may not be relevent
These are texts under https://www.cdcgroup.com/en/sitemap/
number of items grabed are 14
number of items after filtering 5
These are texts under https://www.cdcgroup.com/en/how-we-invest/
number of items grabed are 27
number of items after filtering 7
These are texts under https://www.cdcgroup.com/en/about/our-history/
number of items grabed are 91
number of items after filtering 20
These are texts under https://www.cdcgroup.com/en/contact/
number of items grabed are 45
number of items after filtering 5
These are texts under https://www.cdcgroup.com/en/news-insight/latest-news/
number of items grabed are 26
number of items after filtering 5
These are texts under https://www.cdcgroup.com/en/news-insight/news/putting-people-first-international-safeguarding-summit/
number of items grabed are 23
number of items after filtering 10
These are texts under https://www.cdcgroup.com/en/news-insight/insight/
number of items grabed are 26
numb

url: https://www.cdcgroup.com/en/news-insight/news/cdc-appoints-new-deputy-chief-investment-officer-for-higher-risk-strategies/ invalid
url: https://www.cdcgroup.com/en/how-we-invest/investment-strategy/ invalid
url: https://www.cdcgroup.com/en/news-insight/insight/articles/ceo-blog-sierra-leone/ invalid
url: https://www.cdcgroup.com/en/home/im-looking-to-grow-my-business/ invalid
url: https://www.cdcgroup.com/en/how-we-invest/how-we-partner-with-businesses/ invalid
url: https://www.cdcgroup.com/en/how-we-invest/how-we-partner-with-businesses/initiatives/medaccess/ invalid
url: https://www.cdcgroup.com/en/our-investments/search-results/ invalid
url: https://www.cdcgroup.com/en/how-we-invest/investment-strategy/products/ invalid
url: https://www.cdcgroup.com/our-investments/investment-stories/ invalid
starting to crawl main url:  https://www.eib.europa.eu
main_url:  https://www.eib.europa.eu  is not valid
starting to crawl main url:  https://www.fmo.nl


these are the children links we 

These are texts under https://www.norfund.no/videos-and-stories/investing-for-jobs-article13147-1111.html
number of items grabed are 6
These are texts under https://www.norfund.no/financial-institutions/category1050.html
number of items after filtering 0
number of items grabed are 22
number of items after filtering 9
These are texts under https://www.norfund.no/what-we-do/
number of items grabed are 22
number of items after filtering 9
number of items grabed are 15
number of items after filtering 5
url: https://www.linkedin.com/company/norfund invalid
These are texts under https://www.norfund.no/newsarchive/norfund-supports-africinvest-s-second-close-of-five-article13569-1011.html
number of items grabed are 9
number of items after filtering 3
These are texts under https://www.norfund.no/food-and-agribusiness/category1051.html
number of items grabed are 19
number of items after filtering 8
These are texts under https://www.norfund.no/clean-energy/category1049.html
number of items grabed

These are texts under http://www.proparco.fr/fr/les-codeurs-de-la-silicon-savannah
number of items grabed are 39
number of items after filtering 16
www.proparco.fr  and  twitter.com  may not be relevent
www.proparco.fr  and  twitter.com  may not be relevent
These are texts under https://www.proparco.fr/fr/offre-de-emploi
number of items grabed are 12
number of items after filtering 0
These are texts under http://www.proparco.fr/fr/fefisol-combiner-financements-et-accompagnement
number of items grabed are 18
number of items after filtering 6
These are texts under http://www.proparco.fr/fr/nakheel-palestine-les-fruits-de-lavenir
number of items grabed are 21
number of items after filtering 3
www.proparco.fr  and  www.youtube.com  may not be relevent
www.proparco.fr  and  twitter.com  may not be relevent
These are texts under http://www.proparco.fr/fr/panorama-le-rapport-dactivite-2017-de-proparco-est-disponible
number of items grabed are 24
number of items after filtering 6
www.proparco.

number of items grabed are 23
number of items after filtering 25
number of items after filtering 16
number of items grabed are 23
number of items grabed are 31
number of items after filtering 23
number of items after filtering 12
These are texts under https://www.gatesfoundation.org/What-We-Do/Global-Development/Integrated-Delivery
number of items grabed are 38
number of items after filtering 27
These are texts under https://www.gatesfoundation.org/How-We-Work/General-Information/Grantseeker-FAQ
These are texts under https://www.gatesfoundation.org/What-We-Do/Global-Policy/Global-Education-Learning-Strategy
number of items grabed are 24
number of items grabed are 22
number of items after filtering 19
number of items after filtering 13
These are texts under https://www.gatesfoundation.org/How-We-Work/General-Information/Information-Sharing-Approach
number of items grabed are 19
number of items after filtering 10
These are texts under https://www.gatesfoundation.org/What-We-Do/Global-Dev

number of items grabed are 15
number of items after filtering 4
www.gatesfoundation.org  and  www.impatientoptimists.org  may not be relevent
These are texts under https://www.gatesfoundation.org/What-We-Do/Global-Health/Discovery-and-Translational-Sciences
number of items grabed are 46
number of items after filtering 39
These are texts under https://www.gatesfoundation.org/What-We-Do/Global-Development/Emergency-Response
These are texts under https://www.gatesfoundation.org/What-We-Do/Global-Development/Vaccine-Delivery
number of items grabed are 28
number of items grabed are 42
number of items after filtering 18
number of items after filtering 34
These are texts under https://www.gatesfoundation.org/How-We-Work/Quick-Links/Grants-Database
number of items grabed are 8
number of items after filtering 2
These are texts under https://www.gatesfoundation.org/What-We-Do/Global-Policy/Development-Policy-Finance
number of items grabed are 22
number of items after filtering 16
These are texts

These are texts under https://www.lundinfoundation.org/new-page/
number of items grabed are 7
number of items after filtering 5
These are texts under https://www.lundinfoundation.org/resource-development-sdg/
number of items grabed are 3
number of items after filtering 2
These are texts under https://www.lundinfoundation.org/the-foundation/
number of items grabed are 6
number of items after filtering 4
These are texts under https://www.lundinfoundation.org/portfolio-summary/
number of items grabed are 3
number of items after filtering 1
These are texts under https://www.lundinfoundation.org/management/
number of items grabed are 19
number of items after filtering 16
These are texts under https://www.lundinfoundation.org/our-challenge/
number of items grabed are 7
number of items after filtering 5
starting to crawl main url:  https://www.omidyar.com


these are the children links we crawled
['https://www.facebook.com/OmidyarNetwork', '/financial-documents', '/our-work/impact-investing',

number of items after filtering 10
These are texts under https://www.omidyar.com/our-work/property-rights
number of items grabed are 12
number of items after filtering 11
These are texts under https://www.omidyar.com/offices/silicon-valley
number of items grabed are 1
number of items after filtering 1
These are texts under https://www.omidyar.com/our-work/digital-identity
number of items grabed are 11
number of items after filtering 10
These are texts under https://www.omidyar.com/offices
number of items grabed are 9
number of items after filtering 5
These are texts under https://www.omidyar.com/offices/london
number of items grabed are 1
number of items after filtering 1
These are texts under https://www.omidyar.com/blog/new-approach-new-frontier
number of items grabed are 31
number of items after filtering 22
url: https://www.omidyar.com//twitter.com/jimmychen invalid
These are texts under https://www.omidyar.com/contact
number of items grabed are 14
number of items after filtering 1

number of items after filtering 34
These are texts under https://www.rockefellerfoundation.org/our-work/bellagio-center/conferences/
number of items grabed are 19
number of items after filtering 12
These are texts under https://www.rockefellerfoundation.org/insights/insights-detail/
number of items grabed are 13
number of items after filtering 6
These are texts under https://www.rockefellerfoundation.org/changing-work/
number of items grabed are 17
number of items after filtering 11
These are texts under https://www.rockefellerfoundation.org/terms-of-use/
number of items grabed are 51
number of items after filtering 31
These are texts under https://www.rockefellerfoundation.org/about-us/
number of items grabed are 16
number of items after filtering 7
These are texts under https://www.rockefellerfoundation.org/about-us/our-history/
number of items grabed are 244
number of items after filtering 9
url: https://www.linkedin.com/company/the-rockefeller-foundation invalid
www.rockefellerfoun

These are texts under http://www.tonyelumelufoundation.org/about-tef-programme/
number of items grabed are 4
number of items after filtering 4
These are texts under http://www.tonyelumelufoundation.org/about/
number of items grabed are 6
number of items after filtering 6
tonyelumelufoundation.org  and  twitter.com  may not be relevent
tonyelumelufoundation.org  and  twitter.com  may not be relevent
try succeeded
tonyelumelufoundation.org  and  www.youtube.com  may not be relevent
tonyelumelufoundation.org  and  twitter.com  may not be relevent
tonyelumelufoundation.org  and  twitter.com  may not be relevent
tonyelumelufoundation.org  and  www.facebook.com  may not be relevent
These are texts under http://www.tonyelumelufoundation.org/contact/
number of items grabed are 4
number of items after filtering 0
These are texts under http://www.tonyelumelufoundation.org/support/
number of items grabed are 2
number of items after filtering 1
These are texts under http://www.tonyelumelufoundatio

url: https://www.abraaj.commailto: abraaj@ky.pwc.com invalid
url: https://www.abraaj.commailto: ABRAAJ@deloitte.com invalid
These are texts under https://www.abraaj.com/insights/news/press-releases/the-abraaj-group-acquires-a-stake-in-indorama-fertilizers
number of items grabed are 8
number of items after filtering 8
These are texts under https://www.abraaj.com/insights/news/press-releases/The-Abraaj-Group-invests-in-Turkish-online-travel-agent-Biletall+
number of items grabed are 13
number of items after filtering 12
These are texts under https://www.abraaj.com/sitemap/
These are texts under https://www.abraaj.com/impact-investing/clean-energy
These are texts under https://www.abraaj.com/insights/news/press-releases/the-abraaj-group-to-acquire-java-house-from-ecp/
number of items grabed are 2
number of items grabed are 12
number of items grabed are 15
number of items after filtering 7
number of items after filtering 14
These are texts under https://www.abraaj.com/business/growth-marke

number of items grabed are 9
number of items after filtering 8
These are texts under https://www.abraaj.com/insights/blogs/insightsblogsmeet-myriam-ben-salah-curator-10th-edition-abraaj-group-art-prize/
number of items grabed are 15
number of items after filtering 9
These are texts under https://www.abraaj.com/insights/news/press-releases/abraaj-holdings-abraaj-investment-management-limited-and-colony-capital-inc-successfully-agree-on-transaction/
number of items grabed are 2
number of items after filtering 1
These are texts under https://www.abraaj.com/insights/news/21st-century-business-herald-daily-exclusive-interview-with-arif-naqvi/
number of items grabed are 4
These are texts under https://www.abraaj.com/insights/white-papers
number of items after filtering 3
number of items grabed are 20
number of items after filtering 14
These are texts under https://www.abraaj.com/insights/blogs/deal-view-omar-syed-netlog/
number of items grabed are 17
number of items after filtering 14
These 

main_url:  https://www.acumenfund.org  is not valid
starting to crawl main url:  https://www.agrivie.com


these are the children links we crawled
['http://agrivie.com/investors-list/#investor-178', 'http://agrivie.com/investment-approach/', 'http://agrivie.com/investors-list/#investor-204', 'http://agrivie.com/investors-list/#investor-193', 'https://agrivie.com/about-agri-vie/', 'https://agrivie.com/media-center/news/', 'https://agrivie.com/investment-approach/investment-process/', 'https://agrivie.com/media-center/perspectives/', 'http://agrivie.com/investors-list/#investor-192', 'http://agrivie.com/investments/', 'http://agrivie.com/investors-list/#investor-187', 'https://agrivie.com/investment-approach/', 'https://agrivie.com/investment-approach/partnership-commitment/', 'http://agrivie.com/investors-list/#investor-191', 'http://agrivie.com/wp-content/themes/agrivie/pdfviewer.php?url=http://agrivie.com/wp-content/uploads/2017/02/AgriVie-II-firstclosemedia-release-FINAL-1Feb2017.pdf

url: http://unashamedlyethical.com/Home/default.asp invalid
www.ariyacapital.com  and  www.swedenabroad.se  may not be relevent
www.ariyacapital.com  and  www.duke-energy.com  may not be relevent
www.ariyacapital.com  and  sps.africa  may not be relevent
url: https://www.ariyacapital.commailto:info@mk-africa.com invalid
url: http://www.brownadvisory.com/newsandmedia/Agenda/AdjepongBoateng/tabid/515/Default.aspx invalid
starting to crawl main url:  https://www.bamboofinance.com
main_url:  https://www.bamboofinance.com  is not valid
starting to crawl main url:  https://www.blueorchard.com


these are the children links we crawled
['https://www.blueorchard.com/unctad-world-investment-forum-2/', 'https://www.blueorchard.com/investment-solutions/blended-finance-mandates/', 'mailto:Erik.Geurts@blueorchard.com', 'https://www.blueorchard.com', 'https://www.blueorchard.com/bildungsfonds-fur-afrika-begrust-neue-offentliche-und-private-investoren/', 'https://www.blueorchard.com/european-microfina

These are texts under https://www.blueorchard.com/blueorchard-celebrates-20th-anniversary-flagship-fund/
number of items grabed are 35
number of items after filtering 11
These are texts under https://www.blueorchard.com/about-us/contributors/
number of items grabed are 34
number of items after filtering 8
www.blueorchard.com  and  www.nzz-libro.ch  may not be relevent
These are texts under https://www.blueorchard.com/8149-2/
number of items grabed are 31
number of items after filtering 3
url: https://www.blueorchard.comjavascript: void(0); invalid
These are texts under https://www.blueorchard.com/about-us/blue-orchard/
number of items grabed are 24
number of items after filtering 3
url: https://www.blueorchard.commailto:Nicholas.Gandolfo@blueorchard.com invalid
These are texts under https://www.blueorchard.com/investment-solutions/blended-finance-mandates/mifa/
number of items grabed are 51
number of items after filtering 13
These are texts under https://www.blueorchard.com/impact-inve

number of items grabed are 41
number of items after filtering 5
These are texts under https://www.blueorchard.com/investment-solutions/blueorchard-bond-fund/
number of items grabed are 57
number of items after filtering 15
These are texts under https://www.blueorchard.com/sustainable-investing-emerging-markets-summit/
number of items grabed are 20
number of items after filtering 3
starting to crawl main url:  https://www.catalystprincipal.com


these are the children links we crawled
['https://www.catalystprincipal.com/kenyan-pe-firm-raises-103mn-in-second-round-funding/', 'https://www.catalystprincipal.com/news/', 'tel:+254204296000', 'https://www.catalystprincipal.com/company/', 'https://www.catalystprincipal.com/burbridge-newsletter-interview-with-biniam-yohannes/', 'https://www.catalystprincipal.com/sustainability/', 'https://www.catalystprincipal.com/catalyst-acquires-a-strategic-interest-in-kensta-group/', 'https://www.catalystprincipal.com/fund/', 'mailto:invest@catalystprincipa

These are texts under http://www.grofin.com/language/en/legal-notices/
number of items grabed are 6
number of items after filtering 2
These are texts under http://www.grofin.com/language/en/business_support/#section2_howweworkwithsmes
number of items grabed are 26
number of items after filtering 16
These are texts under http://www.grofin.com/language/en/business_support/#section1_supportbeyondfinance
number of items grabed are 26
number of items after filtering 16
These are texts under http://www.grofin.com/language/en/business_support/#section3_financeexpertisesuccess
number of items grabed are 26
number of items after filtering 16
These are texts under http://www.grofin.com/language/en/entrepreneurs/#section5_howwefinance
These are texts under http://www.grofin.com/language/en/about_grofin/#section7_ourinvestorsandfunders
number of items grabed are 32
number of items grabed are 40
number of items after filtering 16
These are texts under http://www.grofin.com/language/en/entrepreneurs

number of items grabed are 53
number of items after filtering 15
These are texts under https://www.heartcapital.co.za/get-involved
number of items grabed are 41
number of items after filtering 7
www.heartcapital.co.za  and  www.facebook.com  may not be relevent
These are texts under https://www.heartcapital.co.za/legacy-gallery
number of items grabed are 163
number of items after filtering 74
starting to crawl main url:  https://www.ignite-fund.org
main_url:  https://www.ignite-fund.org  is not valid
starting to crawl main url:  https://www.iachl.com


these are the children links we crawled
['parent.php?root=portfolio&link=overview', 'http://www.agra-alliance.org', 'parent.php?root=join&link=experts', 'parent.php?root=approach&link=strategy', 'parent.php?root=portfolio&link=profiles', 'language.php?lang=fr', 'parent.php?root=about&link=overview', 'http://www.tagedstudio.com', 'parent.php?root=join&link=investors', 'parent.php?root=approach&link=criteria', 'parent.php?root=portfolio&li

In [121]:
database.to_csv("crawled_database.csv")

array(['Agriculture and agribusiness accounts for 32 percent of GDP in Sub-Saharan Africa and employs 65 percent of the work force. It is likely to have a bigger impact on poverty reduction than in other sectors as it offers the most direct route of raising returns to poor people’s main assets, i.e. land and labour. The AECF’s work in the agriculture sector is the oldest and largest of its investments with a total of US$ 183 million committed across Africa. The Fund supports businesses to establish a new activity or expand existing businesses across the value chain, as long as the primary beneficiaries of the businesses are rural households living on less than US$ 2 per day.\xa0',
       'Accessible Innovations for Smallholders:\xa0The Fund invests in businesses that have the potential to reach smallholder farmers to improve agricultural productivity and increase household income and food security. We work in a wide range of value chains and production systems to increase access to bet

In [117]:
database

Unnamed: 0,Organisation,Website
1,The Africa Enterprise Challenge Fund (AECF),http://www.aecfafrica.org/
2,Alliance for a green revolution in Africa (AGRA),http://agra.org/grants/
3,Global Innovation Fund,http://www.globalinnovation.fund
4,Acumen Fund,http://acumen.org/
5,Bamboo Finance,http://www.bamboocp.com/
6,African Development Fund,http://www.afdb.org/en/about-us/corporate-info...
7,Willow Impact,http://www.willowimpact.com/
8,Vista Ventures Social Impact Fund,http://www.vistaventures.com/
9,Grayghost ventures,http://www.grayghostventures.com/indexa.html
10,Business/Partners Investing in Entrepreneurs (...,http://businesspartners.co.ke/


In [None]:
class InvestorCrawler():
    def __init__(self):
        

In [5]:
class Searcher():
    def __init__(self):
        # prepare string cleaner
        # store stop words
        self.stop_words = stop_words.get_stop_words("en")
        # prepare stemmer
        self.stemmer = PorterStemmer()
        
        # get and process database
        try:
            self._database = pd.read_csv('crawled_database.csv').iloc[:, [1, 2, 3]]
            print("load crawled database successful")
        except:
            # if no crawled database given
            # load the dataset : including only each company's name, url and summary
            print('fail to load crawled database')
            self._database = pd.read_csv("../input/InvestData_2017-Nov-22_0101.csv").iloc[:, [1, 5, 6]]
            self.crawl_database()
        self.process_database()
    
    def clean_string(self, string):
        """
        Clean input string by 
        1. removing all non alphabet letters
        2. removing all stop-words
        2. stemming all tokens
        """
        # remove non alphabet numbers
        string = re.sub("[^a-z]*", '', string.lower())
        
        # stem the string
        string = self.stemmer.stem_sentence(string)
        
        # remove stop words
        string = "".join([token if token not in self.stop_words for token in string.split()])
        
        return string
        
    def process_database(self):
        # 1: company name, 5: company website, 6: company manual desc
        raw_texts = []
        # preprocess all the text data and remove any row without any useful data, and segment each word
        drop_list = []
        for row in self._database.itertuples():
            if not type(row[1]) is str or (not type(row[2]) is str and not type(row[3]) is str):
                # check if the row has data
                drop_list.append(row[0])
            else:
                # process text data of both manually summarized or crawled data
                tmp_text = []
                for col in [2, 3]:
                    real_col = col - 1
                    text = row[col]
                    if type(text) is str:
                        text = self.word_tokenize_string(text)
                        self._database.iloc[row[0], real_col] = text
                        tmp_text.append(text)
                # merge texts of same company
                tmp_text = '    '.join(tmp_text)
                raw_texts.append(tmp_text)
                
        # drop all the rows that do not have essential data
        self._database.drop(drop_list, inplace=True)
        # create similarity col for similarity search use
        self._database = self._database.assign(similarity=np.zeros(len(self._database)))
        
        # use the raw_texts to generate tfidf model
        self._tfidf, self._dictionary = self.get_tfidf_and_dictionary(raw_texts)
        
    def crawl_database(self):
        for row in self._database.itertuples():
            if not (not type(row[1]) is str or (not type(row[2]) is str and not type(row[3]) is str)):
                # process each website and replace web address with texts crawled
                url = row[2]
                texts = self.get_text_from_url_and_its_children(url)
                if not texts:
                    # if cannot access url, replace url with Nan
                    self._database.iloc[row[0], 1] = np.nan
                else:
                    # replace the url with the crawled texts
                    texts = '   '.join(texts)
                    self._database.iloc[row[0], 1] = texts
        
    def save_database(self):
        self._database.to_csv('crawled_database.csv')
        print("database save successful")
    
    def update_similarity(self, input_text, col=2):
        # get input text vector
        input_text_vector = self.get_doc_vector(input_text)
        i = 0
        for row in self._database.itertuples():
            row_text_vector = self.get_doc_vector(row[col])
            similarity = input_text_vector.dot(row_text_vector)
            self._database.iloc[i, -1] = similarity
            i += 1
        self._database = self._database.sort_values(by='similarity', ascending=False)
        return self._database

    def get_doc_vector(self, text):
        if not text == text:
            return self._w2v['happy'] * 0
        tokens = list(self._dictionary.token2id)
        # convert any unknown word to known word
        new_text = []
        for word in text.split():
            if word in tokens:
                new_text.append(word)
            elif word in w2v: # replace the unknow word with the most similar word in tokens of dictionary
                new_text.append(self._w2v.most_similar_to_given(word_list=tokens, w1=word))

        # start to calculate vector using tfidf weighted word vector sum
        # get tfidf weight
        tokenized_text = [self._dictionary.doc2bow(new_text)]
        tfidf_text = self._tfidf[tokenized_text][0]
        # sum weighted word vectors
        sum_vector = self._w2v['happy'] * 0 # get the size of the word vector
        for word_id, weight in tfidf_text:
            word = self._dictionary[word_id]
            sum_vector += self._w2v[word] * weight
        if sum_vector.any():
            sum_vector /= np.sqrt(sum_vector.dot(sum_vector)) # normalize the vector
            
        return sum_vector
    
    def word_tokenize_string(self, text):
        stop_words = get_stop_words('en') # get too frequent word
        text = text.replace('\r', ' ').replace('\n', ' ') # remove symbols
        text = re.sub(r"http\S+", "", text) # remove urls
        # remove any word that present too frequently or cannot be converted to word vector
        text = [word for sent in sent_tokenize(text.lower()) for word in word_tokenize(sent) \
                if not word in stop_words and word in self._w2v]
        return ' '.join(text)
    
    @staticmethod
    def get_tfidf_and_dictionary(texts):
        # get dictionary of texts
        texts = [text.split() for text in texts]
        dictionary = corpora.Dictionary(texts)

        # get tfidf ranking model
        tokenized_texts = [dictionary.doc2bow(text) for text in texts]
        tfidf = models.TfidfModel(tokenized_texts)

        return tfidf, dictionary
    
    def get_text_from_url_and_its_children(self, main_url):
        print("starting to crawl main url: ", main_url)
        # check validity of main_url
        resp = url_is_valid(main_url)
        if not resp:
            print("main_url is not valid")
            return False

        print("\nstarting to crawl all its children")
        # grab all urls in this web page
        urls = [main_url]
        urls.extend(get_urls_from_url(main_url))
        urls = list(set(urls)) # remove duplicated urls
        print("\n\nthese are the children links we crawled")
        print(urls, "\n")
        # grab all texts in each urls asynchronously
        # argmumentize urls
        urls = [(url, main_url) for url in urls]
        with multiprocessing.Pool(processes=24) as pool:
            text_data = pool.starmap(get_text_from_url_with_check, urls) 
            # try terminating hung jobs
        text_data = [text for text in text_data if len(text_data) > 0] # remove empty returns
        text_data = [text for text_list in text_data for text in text_list] # get list elements to str
        return text_data

SyntaxError: invalid syntax (<ipython-input-5-1cc40a161c8c>, line 35)

In [100]:
searcher = Searcher(w2v=w2v)
# there might be main_url + url is not valid, because url and main_url has overlaps, or main_url is not the root
# must provide root url, or 
# use overlaps to do intelligent main_url + url

load crawled database successful


In [101]:
input_text = "new start up aiming at low income customers, dedicated in green energy"

In [102]:
%time searcher.update_similarity(input_text, col=2)

CPU times: user 47.2 s, sys: 26.3 ms, total: 47.3 s
Wall time: 47.3 s


Unnamed: 0,Organisation,Website,Description,similarity
94,Technoserve,job us important one . sign learn can help us ...,nonprofit organization develops business solut...,0.878591
35,Rockefeller Foundation,"first american red present-day initiatives , r...",rockefeller foundation pioneering organization...,0.874697
86,Dalberg Global Development Advisors,entrepreneurship going create mass-scale jobs ...,strategic advisory firm works clients maximise...,0.874492
1,Alliance for a green revolution in Africa (AGRA),division operates understanding agricultural t...,agra aims invest projects can measurable impac...,0.873631
14,Novastar,"amy bell , executive director social finance j...",", based nairobi , venture catalyst firm assist...",0.873388
100,Southern African Impact Investing Network (SAIIN),"move immediately right hand lane , turn right ...",aims promote concept practice impact investing...,0.873245
89,Impact Amplifier,impact amplifier developed core set services a...,impact amplifier consulting firm works acceler...,0.872664
22,Netherlands Development Finance Company (FMO),bank 45 years experience investing private sec...,entrepreneurial development bank offering fina...,0.872290
6,Willow Impact,"trademarks , logos service marks displayed sit...",impact investment firm manages advises social ...,0.871983
10,LGT Venture Philanthropy Foundation,"venture philanthropy , provide philanthropic c...",seek support non-profit organizations for-prof...,0.870239
