In [7]:
import multiprocessing
import re
import time
from random import shuffle
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab as pl
import requests
from bs4 import BeautifulSoup
from gensim import corpora
from gensim import models
from gensim.corpora import Dictionary
from gensim.models import Phrases
from gensim.models.fasttext import FastText
from gensim.models.keyedvectors import KeyedVectors
from gensim.models.word2vec import LineSentence
from gensim.models.wrappers.fasttext import FastText as FT_wrapper
from IPython import display
from ipywidgets import FloatProgress
from nltk.tokenize import sent_tokenize, word_tokenize
from stop_words import get_stop_words
%matplotlib notebook

In [4]:
# Load the pre-trained word vectors once at notebook level so the same object can
# be handed to Searcher(w2v=...) instead of being re-read from disk by its
# constructor.
w2v = KeyedVectors.load_word2vec_format("../w2v/model/wiki-news-300d-1M.vec")
# Alternative kept for reference: load vectors previously saved with gensim's own
# save()/load() mechanism.
# w2v = KeyedVectors.load("../w2v/model/fasttext_w2v_vector_64")

In [46]:
def get_urls_from_url(main_url, timeout=None):
    """Collect the link targets found on the page at ``main_url``.

    Args:
        main_url: Address of the page to download and scan for ``<a>`` tags.
        timeout: Optional timeout in seconds forwarded to ``requests.get``.
            The default ``None`` waits indefinitely, as before.

    Returns:
        list of str: the ``href`` value of every anchor that has one and whose
        value is longer than five characters, in document order. Values are
        returned exactly as written in the page, so they may be relative.

    Raises:
        requests.RequestException: if the page cannot be downloaded.
    """
    resp = requests.get(main_url, timeout=timeout)
    soup = BeautifulSoup(resp.content, 'html.parser')
    urls = []
    for link in soup.find_all('a'):
        # Anchors without an href (e.g. named anchors) are skipped explicitly
        # instead of relying on a catch-all exception handler.
        href = link.get('href')
        if isinstance(href, str) and len(href) > 5:
            urls.append(href)
    return urls

def get_texts_from_resp(resp):
    """Lazily yield the text of every long paragraph in an HTTP response.

    The response body is parsed as HTML, all ``<p>`` elements are collected and
    only those whose text is longer than 100 characters are kept. Progress is
    reported on stdout once iteration starts.

    Args:
        resp: Response object exposing ``content`` and ``url``.

    Yields:
        str: the text of each retained paragraph, in document order.
    """
    parsed = BeautifulSoup(resp.content, 'html.parser')
    print("These are texts under", resp.url)

    paragraphs = parsed.find_all('p')
    print("number of items grabed are", len(paragraphs))

    # Short paragraphs are dropped; only those above 100 characters survive.
    long_paragraphs = list(filter(lambda p: len(p.text) > 100, paragraphs))
    print("number of items after filtering", len(long_paragraphs))

    yield from (paragraph.text for paragraph in long_paragraphs)

def url_is_valid(url, timeout=30):
    """Fetch ``url`` and return the response only if it answered with HTTP 200.

    This is a best-effort probe: any ordinary failure (malformed URL, DNS or
    connection error, timeout, non-string input) is reported as ``False``
    rather than raised, so callers can simply test the result for truthiness.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled server cannot block a crawler worker forever.

    Returns:
        The ``requests`` response when the final status code is 200, otherwise
        ``False``.
    """
    try:
        resp = requests.get(url, timeout=timeout)
    except Exception:
        # Deliberately broad, but unlike a bare ``except`` it lets
        # KeyboardInterrupt / SystemExit stop a long crawl.
        return False
    # Explicit comparison instead of ``assert``, which is stripped under -O.
    if resp.status_code != 200:
        return False
    return resp

def url_compare(url_target, url_income):
    """Return the length of the common prefix of two URLs, ignoring http/https.

    A leading ``http`` or ``https`` (any letter case) is removed from each URL
    so that the two schemes compare as equal; the remainder, starting at
    ``://``, is then compared character by character from the left.

    Args:
        url_target: Reference URL.
        url_income: URL to compare against the reference.

    Returns:
        float: number of leading characters the two scheme-less URLs share.
        Empty or very short inputs are handled and simply give a small count.
    """
    def _strip_scheme(url):
        # Only remove the prefix when it really is a scheme; slicing blindly
        # would eat the first characters of scheme-less input.
        head = url[:5].lower()
        if head.startswith('https'):
            return url[5:]
        if head.startswith('http'):
            return url[4:]
        return url

    target = _strip_scheme(url_target)
    income = _strip_scheme(url_income)

    n_same_letter = 0.0
    # zip stops at the shorter string, so every character of it is compared
    # (including the last one) and no index can run out of range.
    for target_char, income_char in zip(target, income):
        if target_char != income_char:
            break
        n_same_letter += 1
    return n_same_letter

def get_text_from_url_with_check(url, main_url):
    """Download ``url`` and return its long paragraphs if it belongs to ``main_url``.

    Steps:
      1. Resolve ``url`` against ``main_url`` so relative links become absolute.
      2. Fetch it; give up if it does not answer with HTTP 200.
      3. If a plain-http request was redirected, retry over https and prefer
         that response when it works; otherwise keep the response already
         fetched.
      4. Reject pages whose final address shares fewer than 10 leading
         characters (scheme ignored) with ``main_url``.

    Args:
        url: Link as found on a page; absolute or relative.
        main_url: Address of the site being crawled.

    Returns:
        list of str: paragraph texts of the page, or ``[]`` when the page is
        unreachable or judged unrelated to ``main_url``.
    """
    # urljoin leaves absolute URLs untouched and resolves "/x", "x.html",
    # "../x" and "//host/x" correctly, unlike plain string concatenation.
    url = urljoin(main_url, url)
    resp = url_is_valid(url)
    if not resp:
        print("url:", url, "invalid")
        return []

    # A differing final address means the request was redirected. Only an
    # http:// URL can be upgraded; rewriting an https:// URL would produce the
    # invalid "httpss://".
    if resp.url != url and url.startswith('http://'):
        print('the url is redirected, try https\n')
        secure_url = 'https' + url[len('http'):]
        secure_resp = url_is_valid(secure_url)
        if secure_resp:
            if secure_resp.url == secure_url:
                print('try succeeded')
            resp = secure_resp
        # If https is unavailable the redirected http response is still a
        # successfully fetched page, so it is kept rather than discarded.

    # check if url is the child or sibling of main_url
    if url_compare(main_url, resp.url) < 10: # to avoid http://www.
        print('\nurl:', resp.url, 'might be irrelevent to', main_url, 'quit visiting\n')
        return []

    return list(get_texts_from_resp(resp))

In [91]:
class Searcher():
    """Rank the rows of a company database by similarity to a free-text query.

    Each row holds a company name, a website column (replaced by text crawled
    from that website) and a manually written description. Texts are reduced
    to words known to the word-vector model, a TF-IDF model is fitted on them,
    and documents are embedded as the normalised TF-IDF-weighted sum of their
    word vectors.

    The crawling step relies on the module-level helpers ``url_is_valid``,
    ``get_urls_from_url`` and ``get_text_from_url_with_check``.
    """

    def __init__(self, w2v=None):
        """Load the word vectors and the database, then pre-process the texts.

        Args:
            w2v: Already loaded word-vector model. When ``None`` the vectors
                are read from ``../w2v/model/wiki-news-300d-1M.vec``.
        """
        # load w2v model
        if w2v is None:
            print("start loading w2v, this might take a while")
            self._w2v = KeyedVectors.load_word2vec_format("../w2v/model/wiki-news-300d-1M.vec")
        else:
            self._w2v = w2v

        # get and process database
        try:
            # column 0 of the saved file is the index written by save_database
            self._database = pd.read_csv('crawled_database.csv').iloc[:, [1, 2, 3]]
            print("load crawled database successful")
        except Exception:
            # No usable crawled database: fall back to the raw data set and
            # crawl every website. ``Exception`` (not a bare except) keeps
            # Ctrl-C working while the file is being read.
            print('fail to load crawled database')
            # 1: company name, 5: company website, 6: company manual desc
            self._database = pd.read_csv("../input/InvestData_2017-Nov-22_0101.csv").iloc[:, [1, 5, 6]]
            self.crawl_database()
        self.process_database()

    @staticmethod
    def _row_is_usable(row):
        """Tell whether an ``itertuples`` row has a name and at least one text."""
        has_name = isinstance(row[1], str)
        has_text = isinstance(row[2], str) or isinstance(row[3], str)
        return has_name and has_text

    def process_database(self):
        """Tokenise all texts, drop unusable rows and fit the TF-IDF model.

        Rows lacking a name, or lacking both crawled text and description, are
        removed. The remaining text cells are replaced by their tokenised
        form, a ``similarity`` column initialised to zero is appended, and
        ``self._tfidf`` / ``self._dictionary`` are built from the merged texts
        of each row.
        """
        raw_texts = []
        drop_list = []
        # ``pos`` is the positional row number; row[0] is the index label.
        # Keeping the two apart makes the writes correct for any index.
        for pos, row in enumerate(self._database.itertuples()):
            if not self._row_is_usable(row):
                drop_list.append(row[0])
                continue
            # process text data of both manually summarized or crawled data
            tmp_text = []
            for col in (2, 3):
                text = row[col]
                if isinstance(text, str):
                    text = self.word_tokenize_string(text)
                    # tuple position ``col`` is frame column ``col - 1``
                    self._database.iat[pos, col - 1] = text
                    tmp_text.append(text)
            # merge texts of same company
            raw_texts.append('    '.join(tmp_text))

        # drop all the rows that do not have essential data
        self._database = self._database.drop(drop_list)
        # create similarity col for similarity search use
        self._database = self._database.assign(similarity=np.zeros(len(self._database)))

        # use the raw_texts to generate tfidf model
        self._tfidf, self._dictionary = self.get_tfidf_and_dictionary(raw_texts)

    def crawl_database(self):
        """Replace each website address by the text crawled from that website.

        Only rows that pass the usability check are visited. When nothing can
        be crawled (missing or unreachable address) the cell is set to NaN.
        """
        for pos, row in enumerate(self._database.itertuples()):
            if not self._row_is_usable(row):
                continue
            url = row[2]
            # A usable row may still lack a website (it has a description
            # only); there is nothing to fetch in that case.
            texts = self.get_text_from_url_and_its_children(url) if isinstance(url, str) else False
            if not texts:
                # if cannot access url, replace url with Nan
                self._database.iat[pos, 1] = np.nan
            else:
                # replace the url with the crawled texts
                self._database.iat[pos, 1] = '   '.join(texts)

    def save_database(self):
        """Write the current database to ``crawled_database.csv``."""
        self._database.to_csv('crawled_database.csv')
        print("database save successful")

    def update_similarity(self, input_text, col=1):
        """Score every row against ``input_text`` and sort by that score.

        Args:
            input_text: Query string.
            col: Position, within the ``itertuples`` row (0 is the index), of
                the cell compared with the query. The default ``1`` is the
                first data column.

        Returns:
            pandas.DataFrame: the database sorted by descending ``similarity``;
            it is also stored back on the instance.
        """
        # get input text vector
        input_text_vector = self.get_doc_vector(input_text)
        similarities = [
            input_text_vector.dot(self.get_doc_vector(row[col]))
            for row in self._database.itertuples()
        ]
        self._database['similarity'] = np.asarray(similarities, dtype=float)
        self._database = self._database.sort_values(by='similarity', ascending=False)
        return self._database

    def get_doc_vector(self, text):
        """Embed ``text`` as a unit-length TF-IDF-weighted sum of word vectors.

        Words already in the TF-IDF dictionary are used as they are. Other
        words known to the word-vector model are replaced by the most similar
        dictionary token. Words unknown to both are ignored.

        Args:
            text: Whitespace-separated words. Anything that is not a string
                (for instance a NaN cell) yields the zero vector.

        Returns:
            numpy.ndarray: vector of the model's dimensionality; all zeros when
            no word could be used.
        """
        sum_vector = np.zeros(self._w2v.vector_size, dtype=np.float32)
        if not isinstance(text, str):
            return sum_vector

        token2id = self._dictionary.token2id
        tokens = None  # list form of the vocabulary, built only if needed
        # convert any unknown word to known word
        new_text = []
        for word in text.split():
            if word in token2id:  # dict lookup instead of scanning a list
                new_text.append(word)
            elif word in self._w2v:
                # replace the unknown word with the most similar dictionary token
                if tokens is None:
                    tokens = list(token2id)
                if tokens:
                    # positional call: the parameter names of
                    # most_similar_to_given differ between gensim releases
                    new_text.append(self._w2v.most_similar_to_given(word, tokens))

        # start to calculate vector using tfidf weighted word vector sum
        # get tfidf weight
        tokenized_text = [self._dictionary.doc2bow(new_text)]
        tfidf_text = self._tfidf[tokenized_text][0]
        # sum weighted word vectors
        for word_id, weight in tfidf_text:
            word = self._dictionary[word_id]
            sum_vector += self._w2v[word] * weight
        if sum_vector.any():
            sum_vector /= np.sqrt(sum_vector.dot(sum_vector)) # normalize the vector

        return sum_vector

    def word_tokenize_string(self, text):
        """Lower-case and tokenise ``text``, keeping only useful known words.

        Line breaks and URLs are removed, then English stop words and words
        absent from the word-vector model are discarded.

        Returns:
            str: the surviving tokens joined by single spaces.
        """
        stop_words = set(get_stop_words('en')) # set: constant-time lookups
        text = text.replace('\r', ' ').replace('\n', ' ') # remove line breaks
        text = re.sub(r"http\S+", "", text) # remove urls
        # remove any word that present too frequently or cannot be converted to word vector
        kept = [
            word
            for sent in sent_tokenize(text.lower())
            for word in word_tokenize(sent)
            if word not in stop_words and word in self._w2v
        ]
        return ' '.join(kept)

    @staticmethod
    def get_tfidf_and_dictionary(texts):
        """Build a gensim dictionary and TF-IDF model from tokenised strings.

        Args:
            texts: Iterable of whitespace-separated token strings, one per
                document.

        Returns:
            tuple: ``(tfidf_model, dictionary)``.
        """
        # get dictionary of texts
        texts = [text.split() for text in texts]
        dictionary = corpora.Dictionary(texts)

        # get tfidf ranking model
        tokenized_texts = [dictionary.doc2bow(text) for text in texts]
        tfidf = models.TfidfModel(tokenized_texts)

        return tfidf, dictionary

    def get_text_from_url_and_its_children(self, main_url, n_processes=24):
        """Crawl ``main_url`` and every page it links to, in parallel.

        Args:
            main_url: Address of the site's main page.
            n_processes: Upper bound on the number of worker processes.

        Returns:
            list of str with the paragraph texts of all crawled pages, or
            ``False`` when ``main_url`` itself cannot be fetched.
        """
        print("starting to crawl main url: ", main_url)
        # check validity of main_url
        resp = url_is_valid(main_url)
        if not resp:
            print("main_url is not valid")
            return False

        print("\nstarting to crawl all its children")
        # grab all urls in this web page
        urls = [main_url]
        urls.extend(get_urls_from_url(main_url))
        urls = list(dict.fromkeys(urls)) # remove duplicated urls, keep page order
        print("\n\nthese are the children links we crawled")
        print(urls, "\n")
        # grab all texts in each urls in parallel
        jobs = [(url, main_url) for url in urls]
        # never start more workers than there are pages to fetch
        with multiprocessing.Pool(processes=max(1, min(n_processes, len(jobs)))) as pool:
            # NOTE: starmap waits for every worker; a worker stuck on a slow
            # server delays the whole crawl.
            text_data = pool.starmap(get_text_from_url_with_check, jobs)
        # drop empty per-page results and flatten the rest into one list
        return [text for text_list in text_data if text_list for text in text_list]

In [92]:
# Reuse the word vectors loaded above so the constructor does not reload them.
searcher = Searcher(w2v=w2v)
# NOTE: relative links found while crawling are resolved against main_url, so
# resolution can fail when the link and main_url overlap or when main_url is
# not the site root. Either supply the site's root URL, or detect the overlap
# and merge the two URLs intelligently.

load crawled database successful


In [93]:
# Free-text query that the database rows are ranked against in the next cell.
input_text = "new start up aiming at low income customers, dedicated in green energy"

In [94]:
# Score every database row against the query and display the frame sorted by
# descending similarity. With the default col=1 the query is compared with the
# first data column. %time reports how long the full pass takes.
%time searcher.update_similarity(input_text)

CPU times: user 49.1 s, sys: 50.5 ms, total: 49.2 s
Wall time: 49.2 s


Unnamed: 0,Organisation,Website,Description,similarity
1,Alliance for a green revolution in Africa (AGRA),division operates understanding agricultural t...,agra aims invest projects can measurable impac...,0.810514
100,Southern African Impact Investing Network (SAIIN),"move immediately right hand lane , turn right ...",aims promote concept practice impact investing...,0.741054
102,Global Alliance for Clean Cookstoves,united nations sets inspiring ambition sustain...,global alliance clean cookstoves ( alliance ) ...,0.735426
9,Business/Partners Investing in Entrepreneurs (...,"know investing , entrepreneur , will help crea...",specialist risk finance company provides custo...,0.724118
99,Global Impact Investing Network (GIIN),,not-for-profit organization dedicated increasi...,0.720318
0,The Africa Enterprise Challenge Fund (AECF),"rt . hon lord boateng barrister , former briti...",challenge fund aims reduce poverty supporting ...,0.718582
24,Fund for Developing,,. invests enterprises low income countries pro...,0.712507
101,Global Impact Investing Rating System (GIIRS),,rating system impact measurement impact invest...,0.707554
77,The Investment Fund For Health In Africa,,private equity fund dedicated small medium siz...,0.699739
16,Persistent Energy Capital,,pec invests incubates businesses commercial ap...,0.695284
