In [49]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
from gensim import models
from gensim import corpora
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import Phrases
from gensim.models.fasttext import FastText
from gensim.models.wrappers.fasttext import FastText as FT_wrapper
from gensim.models.word2vec import LineSentence
from gensim.corpora import Dictionary
from nltk.tokenize import sent_tokenize, word_tokenize
from stop_words import get_stop_words
from random import shuffle
import time
import re
import pylab as pl
from ipywidgets import FloatProgress
from IPython import display
import matplotlib.pyplot as plt
%matplotlib notebook

In [None]:
class Searcher():
    def __init__(self, w2v=None):
        # load w2v modle
        if w2v is None:
            print("start loading w2v, this might take a while")
            self._w2v = KeyedVectors.load_word2vec_format("../w2v/model/wiki-news-300d-1M.vec")
        else:
            self._w2v = w2v
        
        # get and process database
        self.get_database()
        
    def get_database(self):
        # load the dataset : including only each company's name, url and summary
        self._database = pd.read_csv("../input/InvestData_2017-Nov-22_0101.csv").iloc[:, [1, 5, 6]]
        # 1: company name, 5: company website, 6: company manual desc
        raw_texts = []
        # preprocess all the text data and remove any row without any useful data, and segment each word
        drop_list = []
        for row in self._database.itertuples():
            if not type(row[1]) is str or (not type(row[2]) is str and not type(row[3]) is str):
                # check if the row has data
                drop_list.append(row[0])
            else:
                # process each website and replace web address with texts crawled
                url = row[2]
                texts = self.get_text_from_url_and_its_children(url)
                if not texts:
                    # if cannot access url, replace url with Nan
                    self._database.iloc[row[0], 1] = np.nan
                else:
                    # replace the url with the crawled texts
                    texts = '   '.join(texts)
                    self._database.iloc[row[0], 1] = texts
                    
                # process text data of both manually summarized or crawled data
                tmp_text = []
                for col in [2, 3]:
                    real_col = col - 1
                    text = row[col]
                    if type(text) is str:
                        text = self.word_tokenize_string(text)
                        self._database.iloc[row[0], real_col] = text
                        tmp_text.append(text)
                # merge texts of same company
                tmp_text = '    '.join(tmp_text)
                raw_texts.append(tmp_text)
                
        # drop all the rows that do not have essential data
        self._database.drop(drop_list, inplace=True)
        # create similarity col for similarity search use
        self._database = self._database.assign(similarity=np.zeros(len(database)))
        
        # use the raw_texts to generate tfidf model
        self._tfidf, self._dictionary = get_tfidf_and_dictionary(raw_texts)
        
        return
    
    def update_similarity(self, input_text, col=1):
        # get input text vector
        input_text_vector = self._get_doc_vector(input_text)
        i = 0
        for row in database.itertuples():
            row_text_vector = self._get_doc_vector(row[col+1])
            similarity = input_text_vector.dot(row_text_vector)
            database.iloc[i, -1] = similarity
            i += 1
        self._database = self._database.sort_values(by='similarity', ascending=False)
        return database

    def get_doc_vector(self, text):
        tokens = list(self._dictionary.token2id)
        # convert any unknown word to known word
        new_text = []
        for word in text.split():
            if word in tokens:
                new_text.append(word)
            elif word in w2v: # replace the unknow word with the most similar word in tokens of dictionary
                new_text.append(self._w2v.most_similar_to_given(word_list=tokens, w1=word))

        # start to calculate vector using tfidf weighted word vector sum
        # get tfidf weight
        tokenized_text = [self._dictionary.doc2bow(new_text)]
        tfidf_text = self._tfidf[tokenized_text][0]
        # sum weighted word vectors
        sum_vector = self._w2v['happy'] * 0 # get the size of the word vector
        for word_id, weight in tfidf_text:
            word = self._dictionary[word_id]
            sum_vector += self._w2v[word] * weight
        sum_vector /= np.sqrt(sum_vector.dot(sum_vector)) # normalize the vector

        return sum_vector
    
    def word_tokenize_string(self, text):
        stop_words = get_stop_words('en') # get too frequent word
        text = text.replace('\r', ' ').replace('\n', ' ') # remove symbols
        text = re.sub(r"http\S+", "", text) # remove urls
        # remove any word that present too frequently or cannot be converted to word vector
        text = [word for sent in sent_tokenize(text.lower()) for word in word_tokenize(sent) \
                if not word in stop_words and word in self._w2v]
        return ' '.join(text)
    
    @staticmethod
    def get_tfidf_and_dictionary(texts):
        # get dictionary of texts
        texts = [text.split() for text in texts]
        dictionary = corpora.Dictionary(texts)

        # get tfidf ranking model
        tokenized_texts = [dictionary.doc2bow(text) for text in texts]
        tfidf = models.TfidfModel(tokenized_texts)

        return tfidf, dictionary
    
    def get_text_from_url_and_its_children(self, main_url):
        print("starting to crawl main url: ", main_url)
        # check validity of main_url
        resp = self.url_is_valid(main_url)
        if not resp:
            print("main_url is not valid")
            return False

        text_data = []
        print("\nstarting to crawl all its children")
        # grab all urls in this web page
        urls = [main_url]
        urls.extend(self.get_urls_from_url(main_url))
        print("\n\nthese are the children links we crawled")
        print(urls, "\n")
        # grab all texts in each urls
        for url in set(urls):
            resp = self.url_is_valid(url)
            if not resp:
                resp = self.url_is_valid(main_url + url)
                if not self.url_is_valid(main_url + url):
                    print("url:", main_url + url, "invalid")
                    continue
            print(resp.url)
            # check if url is the child or sibling of main_url
            if self.url_compare(main_url, resp.url) < 10: # to avoid http://www.
                print('\nurl:', resp.url, 'might be irrelevent to', main_url, 'quit visiting\n')
                continue
            for text in self.get_texts_from_resp(resp):
                text_data.append(text)
        return text_data
        
    @staticmethod
    def get_urls_from_url(main_url):
        resp = requests.get(main_url)
        soup = BeautifulSoup(resp.content, 'html.parser')
        urls = []
        links = soup.find_all('a')
        for url in links:
            try:
                url = url.attrs['href']
                if len(url) > 5:
                    urls.append(url)
            except:
                pass
        return urls
    
    @staticmethod
    def get_texts_from_resp(resp):
        # parse the web response
        soup = BeautifulSoup(resp.content, 'html.parser')
        # find and filter texts
        print("These are texts under", resp.url)
        texts = soup.find_all('p')
        print("number of items grabed are", len(texts))
        texts = [text for text in texts if len(text.text) > 100]
        print("number of items after filtering", len(texts))
        # output texts
        for text in texts:
            print(text.text)
            yield text.text
    
    @staticmethod
    def url_is_valid(url):
        try:
            resp = requests.get(url)
            assert resp.status_code == 200
            return resp
        except:
            return False

    @staticmethod
    def url_compare(url_target, url_income):
        n_same_letter = 0.0
        # delete all http or https 
        if url_target[4] == 's':
            url_target = url_target[5:]
        else:
            url_target = url_target[4:]
        if url_income[4] == 's':
            url_income = url_income[5:]
        else:
            url_income = url_income[4:]
        # check similarity
        min_len = min(len(url_target), len(url_income))
        for i in range(min_len-1):
            if url_target[i] == url_income[i]:
                n_same_letter += 1
            else:
                break
        return n_same_letter

In [None]:
searcher = Searcher(w2v=w2v)

These are texts under https://www.aecfafrica.org/
number of items grabed are 9
number of items after filtering 5
The portfolio is demonstrating how renewable energy technologies
have the potential to reach Africa’s rural poor.
Agriculture and Agribusiness accounts for 32% of GDP in sub-Saharan
Africa and employs 65% of the work force.
The Africa Enterprise Challenge Fund (AECF) is a US $304 million private sector fund that provides catalytic funding to enterprises in 24 countries across sub-Saharan Africa.


Agriculture and agribusiness accounts for 32 percent of GDP in Sub-Saharan Africa and employs 65 percent of


....
Renewable Energy and Adaptation to Climate Technologies (REACT) window is demonstrating how renewable energ


....

finish crawling texts from main url

starting to crawl all its children


these are the children links we crawled
['/about-us/who-we-are', '/about-us/our-history', '/the-aecf-board', '/the-aecf-management', '/about-us/funding-partners', '/about-us/strateg

These are texts under https://www.aecfafrica.org/index.php
number of items grabed are 9
number of items after filtering 5
The portfolio is demonstrating how renewable energy technologies
have the potential to reach Africa’s rural poor.
https://www.aecfafrica.org/index.php
Agriculture and Agribusiness accounts for 32% of GDP in sub-Saharan
Africa and employs 65% of the work force.
https://www.aecfafrica.org/index.php
The Africa Enterprise Challenge Fund (AECF) is a US $304 million private sector fund that provides catalytic funding to enterprises in 24 countries across sub-Saharan Africa.


https://www.aecfafrica.org/index.php
Agriculture and agribusiness accounts for 32 percent of GDP in Sub-Saharan Africa and employs 65 percent of


....
https://www.aecfafrica.org/index.php
Renewable Energy and Adaptation to Climate Technologies (REACT) window is demonstrating how renewable energ


....
https://www.aecfafrica.org/index.php
These are texts under https://www.aecfafrica.org/index.php
num

These are texts under https://www.aecfafrica.org/index.php
number of items grabed are 9
number of items after filtering 5
The portfolio is demonstrating how renewable energy technologies
have the potential to reach Africa’s rural poor.
https://www.aecfafrica.org/index.php
Agriculture and Agribusiness accounts for 32% of GDP in sub-Saharan
Africa and employs 65% of the work force.
https://www.aecfafrica.org/index.php
The Africa Enterprise Challenge Fund (AECF) is a US $304 million private sector fund that provides catalytic funding to enterprises in 24 countries across sub-Saharan Africa.


https://www.aecfafrica.org/index.php
Agriculture and agribusiness accounts for 32 percent of GDP in Sub-Saharan Africa and employs 65 percent of


....
https://www.aecfafrica.org/index.php
Renewable Energy and Adaptation to Climate Technologies (REACT) window is demonstrating how renewable energ


....
https://www.aecfafrica.org/index.php
These are texts under https://www.aecfafrica.org/index.php
num


url: https://www.facebook.com/AECFAfrica/ might be irrelevent to http://www.aecfafrica.org/ quit visiting

These are texts under https://www.aecfafrica.org/index.php
number of items grabed are 9
number of items after filtering 5
The portfolio is demonstrating how renewable energy technologies
have the potential to reach Africa’s rural poor.
https://www.aecfafrica.org/index.php
Agriculture and Agribusiness accounts for 32% of GDP in sub-Saharan
Africa and employs 65% of the work force.
https://www.aecfafrica.org/index.php
The Africa Enterprise Challenge Fund (AECF) is a US $304 million private sector fund that provides catalytic funding to enterprises in 24 countries across sub-Saharan Africa.


https://www.aecfafrica.org/index.php
Agriculture and agribusiness accounts for 32 percent of GDP in Sub-Saharan Africa and employs 65 percent of


....
https://www.aecfafrica.org/index.php
Renewable Energy and Adaptation to Climate Technologies (REACT) window is demonstrating how renewable ener

These are texts under https://acumen.org/approach/
number of items grabed are 21
number of items after filtering 6
The markets alone cannot solve the problems of poverty; nor are charity and aid enough to tackle the challenges faced by over two-thirds of the world’s population living in poverty. Patient capital is a third way that seeks to bridge the gap between the efficiency and scale of market-based approaches and the social impact of pure philanthropy.
https://acumen.org/approach/
Rather than investing traditional capital, we invest philanthropic, or “patient,” capital that provides startups with the flexibility and security to grow their business and reach as many poor customers as possible.
https://acumen.org/approach/
We make seed and early-stage investments that enable social enterprises to validate assumptions, bring products and services to market, and begin to scale.
https://acumen.org/approach/
We provide our companies with access to our expertise and networks of advisors w

These are texts under https://acumen.org/anti-corruption-policy/
number of items grabed are 16
number of items after filtering 5
In making patient capital investments, Acumen is committed to the highest ethical standards. Acumen wants to work with entrepreneurs who aim to build ethical businesses. We have a zero tolerance policy for businesses or people that do not obey the letter and spirit of all relevant laws, particularly in the areas of corruption, tax-fraud, terrorism and money-laundering. The letter and spirit of employee protection laws are also a major focus for Acumen.
https://acumen.org/anti-corruption-policy/
Acumen has designed its investment process to root out some of these concerns, such as by requiring periodic certifications by, and USA PATRIOT Act database searches on, investees. Each investment is unique, and we need the good judgment and efforts of our team to make the process work. Acumen also depends on input and cooperation received from others in reporting alle

These are texts under https://acumen.org/seepeople/
number of items grabed are 5
number of items after filtering 1
At Acumen, we believe in the potential of every human being. That’s why we invest in leaders and entrepreneurs creating solutions that enable people living in poverty to transform their lives. Meet the people we serve and hear their stories of agency, opportunity and dignity.
https://acumen.org/seepeople/
These are texts under https://acumen.org/fellowships/regions/
number of items grabed are 8
number of items after filtering 5
Our Fellows Program is currently offered in East Africa, India, and Pakistan. Over the course of a year, Fellows remain in their jobs while participating in five in-person, multi-day immersive seminars and engaging in online content between seminars. The program design is centered on group-based learning, self-reflection, and real-world application. Fellows receive training, practical tools and the space to explore their own leadership journeys whil

url: http://acumen.org/https://briteweb.com/ invalid
These are texts under https://acumen.org/partners/
number of items grabed are 16
number of items after filtering 7
Our partners are changemakers. They put their money where their mouths are and they’re not afraid of taking risks to create outsized impact. They share our belief that every individual, not just some, deserve the right to live with dignity, not dependence. Our partners are the individuals, corporations and institutions who provide the financial and human capital that allow us to take on the world’s toughest challenges.
https://acumen.org/partners/
Here at Acumen, we refer to our donors as “partners” because giving to Acumen is about more than a financial contribution. Our partners play an integral part in the organization, and we value the expertise and perspective that they bring to the table.
https://acumen.org/partners/
We go to great lengths to maintain transparency with our partners, sharing detailed reporting on ou


url: https://twitter.com/acumen might be irrelevent to http://acumen.org/ quit visiting


url: https://www.plusacumen.org/courses/social-entrepreneurship-101 might be irrelevent to http://acumen.org/ quit visiting

These are texts under https://acumen.org/work-with-us/
number of items grabed are 12
number of items after filtering 4
function resizeResumatorIframe(height,nojump){if(nojump== 0){window.scrollTo(0,0);}document.getElementById("resumator-job-frame").height = parseInt(height)+20;}
https://acumen.org/work-with-us/
Our Fellows Programs develop extraordinary individuals with the knowledge, support system and practical wisdom to unlock their full potential to drive social change.
https://acumen.org/work-with-us/
+Acumen offers world-class online courses for anyone anywhere to think differently, learn collaboratively and join a global community of learners and doers with a burning desire to change the world.
https://acumen.org/work-with-us/
Our portfolio companies are looking for 

These are texts under https://acumen.org/blog/forbes-names-jacqueline-novogratz-as-one-of-100-greatest-living-business-minds/
number of items grabed are 12
number of items after filtering 4
Acumen Founder and CEO Jacqueline Novogratz is featured in Forbes magazine’s special centennial issue, released today, as one of The World’s 100 Greatest Living Business Minds. In celebration of its 100th anniversary, Forbes has put together this collector’s edition featuring essays, lessons and ideas for the next 100 years from today’s most influential business leaders around the world.
https://acumen.org/blog/forbes-names-jacqueline-novogratz-as-one-of-100-greatest-living-business-minds/
In developing the list of 100, Forbes sought people who had either created something with a lasting impact on the world or innovated in a way that transcends their given field. Other honorees featured on Forbes’ 100 Greatest Living Business Minds list include: Warren Buffett, Bill Gates, Jeff Bezos, Mark Zuckerber

These are texts under http://www.bamboocp.com/
number of items grabed are 7
number of items after filtering 6
Over the last 10 years we have been addressing underserved mass-market opportunities through game-changing companies.
We have a seasoned team of pioneer investors in successive countries and sectors. Our investee companies draw on expertise we’ve gained from operational roles in both developed and emerging markets.
We make money for our investors – since inception an overall IRR in local currency of 21.9%. But our impact is also social and environmental. We have helped to create 25,000 jobs and improved 29 million lives.
Husk Power Systems, today announced a US $20 million equity investment* by Shell Technology Ventures LLC, Swedish development finance institution Swedfund International and ENGIE Rassembleurs d’Energies, ENGIE group’s impact investment fund. Husk is now positioned to accelerate its growth to develop what the International Energy Agency estimates as a $190 billi

These are texts under http://www.bamboocp.com/approach/
number of items grabed are 11
number of items after filtering 6
Our rigorous selection process ensures that our investee companies have what it takes. Then we provide capital and board-level support to help management grow value for shareholders, and make a difference to their communities.
http://www.bamboocp.com/approach/
We search out game-changing companies that can produce good returns plus social impact: investments that matter. We focus on mass-market opportunities in just three sectors: finance, energy and healthcare. Generally, but not always, our investments are in emerging economies.
http://www.bamboocp.com/approach/
We introduce international standard governance and compliance, and instill financial discipline. We often strengthen management and increase the focus on customers, working with our investee companies to create partnerships and to leverage the expertise we have gained in other markets. As a result, we speed 

These are texts under http://www.bamboocp.com/investments/
number of items grabed are 2
number of items after filtering 1
We invest in game-changing companies that can tap into mass-market opportunities. Our investments lead growth and social impact by anticipating major market trends. That approach enables us to generate impressive financial and social return. See below the list of our direct investments and select indirect.
http://www.bamboocp.com/investments/
These are texts under http://www.bamboocp.com/terms-and-conditions/
number of items grabed are 1
number of items after filtering 0
url: http://www.bamboocp.com/http://www.bamboocp.com/news/ invalid
These are texts under http://www.bamboocp.com/about/impact/
number of items grabed are 6
number of items after filtering 5
We make money for our investors. Since inception, our exits have delivered an overall IRR in local currency of 21.9%. The range is 16-22% and 1.5-1.9x cash (depending on FX effects). We have achieved these figure

url: http://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf//en/documents/departmental-annual-reports/ invalid
url: http://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf//en/documents/publications/african-statistical-yearbook/ invalid
url: http://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf//en/countries/west-africa/gambia/ invalid
url: http://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf//en/topics-and-sectors/initiatives-partnerships/deauville-partnership/ invalid
url: http://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf//en/knowledge/publications/african-economic-outlook/ invalid
url: http://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf//en/about-us/corporate-information/african-development-fund-adf/adf-country-resources-allocation/ invalid

url: https://www.youtube.com/user/afdbcomu might be irrelevent to h

url: http://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf//en/projects-and-operations/project-portfolio/ invalid
url: http://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf//en/countries/north-africa/mauritania/ invalid
url: http://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf//en/countries/west-africa/liberia/ invalid
url: http://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf//en/topics-and-sectors/topics/millennium-development-goals-mdgs/ invalid
url: http://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf//en/topics-and-sectors/initiatives-partnerships/africa50/ invalid
url: http://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf//en/documents/publications/mdg-report/ invalid
url: http://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf//en/countries/central-africa/ invalid
url: http://www

url: http://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf//en/topics-and-sectors/topics/quality-assurance-results/development-effectiveness-reviews/ invalid
url: http://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf//en/about-us/corporate-information/ invalid
url: http://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf//en/about-us/corporate-information/our-values/ invalid
url: http://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf//en/contact-us/ invalid
url: http://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf//en/projects-and-operations/project-cycle/loan-effectiveness/ invalid
url: http://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf//en/documents/legal-documents/loan-and-grant-conditions/ invalid
url: http://www.afdb.org/en/about-us/corporate-information/african-development-fund-adf//en/knowledge/publications/

In [46]:
a

<__main__.Searcher at 0x7f0ea7a208d0>

In [11]:
def rescue_code(function):
    import inspect
    get_ipython().set_next_input("".join(inspect.getsourcelines(function)[0]))

In [12]:
def get_database(w2v):
    # load the dataset and all the text data
    database = pd.read_csv("../input/InvestData_2017-Nov-22_0101.csv").iloc[:, [1, 6, 9]]
    raw_texts = []
    # preprocess all the text data and remove any row without any useful data, and segment each word
    drop_list = []
    for row in database.itertuples():
        if not type(row[1]) is str or (not type(row[2]) is str and not type(row[3]) is str):
            drop_list.append(row[0])
        else:
            for col in [2, 3]:
                real_col = col - 1
                text = row[col]
                if type(text) is str:
                    text = word_tokenize_string(text, w2v)
                    text = ' '.join(text)
                    database.iloc[row[0], real_col] = text
                    raw_texts.append(text)
    database.drop(drop_list, inplace=True)
    database = database.assign(similarity=np.zeros(len(database)))

    return database, raw_texts

def get_tfidf_and_dictionary(texts):
    # get dictionary of texts
    texts = [text.split() for text in texts]
    dictionary = corpora.Dictionary(texts)
    
    # get tfidf ranking model
    tokenized_texts = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(tokenized_texts)
    
    return tfidf, dictionary

def word_tokenize_string(text, w2v):
    stop_words = get_stop_words('en')
    text = text.replace('\r', ' ').replace('\n', ' ')
    text = re.sub(r"http\S+", "", text)
    text = [word for sent in sent_tokenize(text.lower()) for word in word_tokenize(sent) if not word in stop_words and word in w2v]
    return text

def update_similarity(w2v, dictionary, tfidf, input_text, database):
    # get input text vector
    input_text_vector = get_doc_vector(input_text, w2v, dictionary, tfidf)
    i = 0
    for row in database.itertuples():
        row_text_vector = get_doc_vector(row[2], w2v, dictionary, tfidf)
        similarity = input_text_vector.dot(row_text_vector)
        database.iloc[i, -1] = similarity
        i += 1
    database = database.sort_values(by='similarity', ascending=False)
    return database

def get_doc_vector(text, w2v, dictionary, tfidf):
    tokens = list(dictionary.token2id)
    # convert any unknown word to known word
    new_text = []
    for word in text.split():
        if word in tokens:
            new_text.append(word)
        elif word in w2v: # replace the unknow word with the most similar word in tokens of dictionary
            new_text.append(w2v.most_similar_to_given(word_list=tokens, w1=word))
    
    # start to calculate vector using tfidf weighted word vector sum
    # get tfidf weight
    tokenized_text = [dictionary.doc2bow(new_text)]
    tfidf_text = tfidf[tokenized_text][0]
    # sum weighted word vectors
    sum_vector = w2v['happy'] * 0 # get the size of the word vector
    for word_id, weight in tfidf_text:
        word = dictionary[word_id]
        sum_vector += w2v[word] * weight
    sum_vector /= np.sqrt(sum_vector.dot(sum_vector)) # normalize the vector
    
    return sum_vector

In [13]:
w2v = KeyedVectors.load_word2vec_format("../w2v/model/wiki-news-300d-1M.vec")
# w2v = KeyedVectors.load("../w2v/model/fasttext_w2v_vector_64")

KeyboardInterrupt: 

In [14]:
database, texts = get_database(w2v)
tfidf, dictionary = get_tfidf_and_dictionary(texts)

In [15]:
pd.read_csv("../input/InvestData_2017-Nov-22_0101.csv").iloc[:, :]

Unnamed: 0,No,Organisation,Type_1,Type_2,Region,Website,Description,Relevant_Products_and_Services,SectorsIndustries,Funding_details,Requirements__Eligibility,Examples,Application,Contact,Additional_Information1,Additional_Information2,Additional_Information3
0,0.0,filler,,,,,,,,,,,,,,,
1,1.0,The Africa Enterprise Challenge Fund (AECF),,,Africa,http://www.aecfafrica.org/,The AECF is an Africa-based challenge fund tha...,,,1) Challenge Fund model where funding is award...,"Vary from competition to competition, but gene...",M-Kopa\r,General Procedure\r\n\r\nBefore:\r\n1) Submit ...,"The AECF Limited,\r\n\r\nWest End Towers, Kanj...",,,
2,2.0,Alliance for a green revolution in Africa (AGRA),,,Africa,http://agra.org/grants/,AGRA aims to invest in projects that can have ...,,,No much information provided - companies are u...,,,,Nairobi\r\n\r\nTel: +254 (20) 3675 000 / +254 ...,,,
3,3.0,Global Innovation Fund,,,Africa,http://www.globalinnovation.fund,A government-sponsored non-profit fund in Lon...,,,"Amount: $230,000 max for pilot project\r\n* Pi...",More details: http://www.globalinnovation.fund...,,Accept applications all year round\r\n\r\n1) I...,http://www.globalinnovation.fund/contact-us,,,
4,4.0,Acumen Fund,,,Africa,http://acumen.org/,A charity organisation providing funding for e...,,,"Investment capital in the range of $0.25M-$3M,...",Social responsibility and demonstrated sustain...,M-Kopa\r\n\r\nD.Light (Global company; african...,Accept applications all year round\r\n\r\nMore...,Do not accept contacts prior to submission,,,
5,5.0,Bamboo Finance,,,Africa,http://www.bamboocp.com/,Bamboo Finance is a commercial private equity ...,,,,The company provides essential services afford...,,"Bamboo Finance Africa\r\n7th Floor, Purshottam...",,,,
6,6.0,African Development Fund,,,Africa,http://www.afdb.org/en/about-us/corporate-info...,The ADF contributes to the promotion of econom...,,,Areas of funding: \r\nThe African Development ...,,,East Africa Regional Resource Center (EARC)\r\...,,,,
7,7.0,Willow Impact,,,Africa,http://www.willowimpact.com/,An impact investment firm that manages and adv...,,,,"Geographic areas: Eastern Africa, the Middle E...",,http://www.willowimpact.com/contact-us/email-u...,,,,
8,8.0,Vista Ventures Social Impact Fund,,,Africa,http://www.vistaventures.com/,A California-based fund to provide education a...,,,Seek early stage companies that wish to raise ...,"By early stage, we expect that applicant compa...",,Submit:\r\n1) Business plan (or equivalent)\r\...,,,,
9,9.0,Grayghost ventures,,,Africa,http://www.grayghostventures.com/indexa.html,Seeks to eliminate poverty and strengthen comm...,,,Stage: Early-stage venture capital \r\n\r\nLoc...,Our investment approach prioritizes social ven...,M-Kopa,Submit an executive summary to\r\ninfo@graygho...,Headquarters:\r\nGray Ghost Ventures\r\n2200 C...,,,


In [16]:
input_text = "new start up aiming at low income customers, dedicated in green energy"

In [17]:
search_output = update_similarity(w2v, dictionary, tfidf, input_text, database)
list(search_output.iloc[1, :])

['Fund for Developing',
 '. invests enterprises low income countries promote business development contribute economic growth poverty alleviation . ’ s geographic focus eastern southern africa , well selected countries asia central america . focuses supporting small medium sized companies .',
 nan,
 0.87590879201889038]

In [44]:
vec1 = get_doc_vector('a startup that dedicate to green energy', w2v, dictionary, tfidf)
vec2 = get_doc_vector('business regrading green energy', w2v, dictionary, tfidf)
vec3 = get_doc_vector('companies specificly support low-income people', w2v, dictionary, tfidf)

In [46]:
vec1.dot(vec2)

0.83994347

In [11]:
vec1

array([-0.01663876, -0.00337446, -0.02395986, -0.04154515, -0.10667257,
        0.01662513, -0.01009543,  0.06161033,  0.05003018, -0.06181473,
       -0.02179295, -0.01145902, -0.01362746, -0.03405415,  0.00218448,
        0.00767476, -0.01452028, -0.0018257 ,  0.07659619, -0.04767921,
       -0.04837132, -0.02310556, -0.05546847, -0.00884663, -0.01575219,
        0.00686825, -0.02110741,  0.02846674, -0.00410359,  0.0267566 ,
        0.0219034 ,  0.01259031, -0.00153076,  0.0267643 ,  0.04200932,
       -0.05939888,  0.02810077, -0.01088257,  0.02167377,  0.05022477,
        0.00617732, -0.01705034,  0.04542316,  0.08962091, -0.01164754,
        0.0392498 , -0.02558717, -0.0163672 ,  0.06803226, -0.01166763,
        0.01418332,  0.04633227, -0.00154001,  0.00630221, -0.05789408,
        0.02684514,  0.03213207,  0.02908244, -0.05910822,  0.01098826,
       -0.04702628, -0.01257465,  0.02256784, -0.01471028], dtype=float32)

In [12]:
vec2

array([-0.01663876, -0.00337446, -0.02395986, -0.04154515, -0.10667257,
        0.01662513, -0.01009543,  0.06161033,  0.05003018, -0.06181473,
       -0.02179295, -0.01145902, -0.01362746, -0.03405415,  0.00218448,
        0.00767476, -0.01452028, -0.0018257 ,  0.07659619, -0.04767921,
       -0.04837132, -0.02310556, -0.05546847, -0.00884663, -0.01575219,
        0.00686825, -0.02110741,  0.02846674, -0.00410359,  0.0267566 ,
        0.0219034 ,  0.01259031, -0.00153076,  0.0267643 ,  0.04200932,
       -0.05939888,  0.02810077, -0.01088257,  0.02167377,  0.05022477,
        0.00617732, -0.01705034,  0.04542316,  0.08962091, -0.01164754,
        0.0392498 , -0.02558717, -0.0163672 ,  0.06803226, -0.01166763,
        0.01418332,  0.04633227, -0.00154001,  0.00630221, -0.05789408,
        0.02684514,  0.03213207,  0.02908244, -0.05910822,  0.01098826,
       -0.04702628, -0.01257465,  0.02256784, -0.01471028], dtype=float32)