In [10]:
import numpy as np
import pandas as pd
from gensim import models
from gensim import corpora
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import Phrases
from gensim.models.fasttext import FastText
from gensim.models.wrappers.fasttext import FastText as FT_wrapper
from gensim.models.word2vec import LineSentence
from gensim.corpora import Dictionary
from nltk.tokenize import sent_tokenize, word_tokenize
from stop_words import get_stop_words
from random import shuffle
import time
import re
import pylab as pl
from ipywidgets import FloatProgress
from IPython import display
import matplotlib.pyplot as plt
%matplotlib notebook

In [11]:
def rescue_code(function):
    """Put the source text of ``function`` into the next notebook input cell.

    The source is retrieved with :mod:`inspect` and handed to the running
    IPython shell via ``set_next_input``; nothing is returned.  Requires an
    IPython/Jupyter session, because ``get_ipython`` only exists there.
    """
    import inspect

    shell = get_ipython()
    shell.set_next_input(inspect.getsource(function))

In [12]:
def get_database(w2v, path="../input/InvestData_2017-Nov-22_0101.csv"):
    """Load the CSV at ``path`` and return its cleaned text table plus all cleaned texts.

    Only the CSV columns at positions 1, 6 and 9 are kept.  The first kept
    column must hold a string and at least one of the two remaining (text)
    columns must hold a string, otherwise the row is dropped.
    NOTE(review): these are presumably the organisation name and two free-text
    fields -- confirm against the CSV header.

    Parameters
    ----------
    w2v : word-vector model
        Passed through to ``word_tokenize_string`` to filter the tokens.
    path : str, optional
        Location of the CSV file; defaults to the dataset used in this notebook.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The table with every string text cell replaced by its space-joined
        tokens and a zero-filled ``similarity`` column appended (the original
        row labels are kept, so the index may have gaps), and the flat list of
        all cleaned texts in row order.
    """
    # .copy() makes the column subset an independent frame that is safe to write into.
    database = pd.read_csv(path).iloc[:, [1, 6, 9]].copy()

    raw_texts = []
    drop_list = []
    # enumerate() supplies the row *position* for iloc; the index label is kept
    # separately because drop() works on labels.
    for position, (label, name, *texts) in enumerate(database.itertuples(name=None)):
        has_text = any(isinstance(cell, str) for cell in texts)
        if not isinstance(name, str) or not has_text:
            drop_list.append(label)
            continue
        # Text columns sit at positions 1.. in the frame (position 0 is the name).
        for column_position, cell in enumerate(texts, start=1):
            if isinstance(cell, str):
                cleaned = ' '.join(word_tokenize_string(cell, w2v))
                database.iloc[position, column_position] = cleaned
                raw_texts.append(cleaned)

    database = database.drop(drop_list)
    database = database.assign(similarity=np.zeros(len(database)))

    return database, raw_texts

def get_tfidf_and_dictionary(texts):
    """Fit a token dictionary and a TF-IDF model on whitespace-tokenised texts.

    ``texts`` is an iterable of strings; each one is split on whitespace.
    Returns the pair ``(tfidf_model, dictionary)``.
    """
    token_lists = [document.split() for document in texts]

    # vocabulary: token <-> integer id mapping built from every document
    vocabulary = corpora.Dictionary(token_lists)

    # the TF-IDF model is fitted on the bag-of-words form of the same documents
    bow_corpus = [vocabulary.doc2bow(tokens) for tokens in token_lists]
    tfidf_model = models.TfidfModel(bow_corpus)

    return tfidf_model, vocabulary

def word_tokenize_string(text, w2v):
    """Lower-case and tokenise ``text``, keeping only useful, known words.

    Line breaks are turned into spaces and anything matching ``http\\S+`` is
    removed before the text is lower-cased, split into sentences and then into
    word tokens.  A token is kept only if it is not in the ``'en'`` stop-word
    list and ``token in w2v`` is true.

    Parameters
    ----------
    text : str
        Raw text to clean.
    w2v : container of words
        Anything supporting ``word in w2v`` (e.g. the word-vector model).

    Returns
    -------
    list of str
        The surviving tokens in their original order.
    """
    # A set makes each membership test O(1); the list returned by
    # get_stop_words would otherwise be scanned once per token.
    stop_words = set(get_stop_words('en'))
    text = text.replace('\r', ' ').replace('\n', ' ')
    text = re.sub(r"http\S+", "", text)
    return [
        word
        for sent in sent_tokenize(text.lower())
        for word in word_tokenize(sent)
        if word not in stop_words and word in w2v
    ]

def update_similarity(w2v, dictionary, tfidf, input_text, database):
    """Score every row of ``database`` against ``input_text`` and return the rows ranked.

    Each row is represented by its first text cell that holds a string (the
    cells between the name column and the trailing similarity column).  The
    score is the dot product of the two document vectors produced by
    ``get_doc_vector``.  Rows without any string text cell get ``NaN`` and
    therefore sort last.

    Parameters
    ----------
    w2v, dictionary, tfidf
        Passed unchanged to ``get_doc_vector``.
    input_text : str
        Query text, tokens separated by whitespace.
    database : pandas.DataFrame
        Table whose first column is a name, whose last column receives the
        scores, and whose columns in between hold the cleaned texts.

    Returns
    -------
    pandas.DataFrame
        A copy of ``database`` sorted by ``similarity`` in descending order.

    Notes
    -----
    The last column of ``database`` itself is overwritten with the new scores
    as a side effect; only the sorting is done on a copy.
    """
    # get input text vector
    input_text_vector = get_doc_vector(input_text, w2v, dictionary, tfidf)

    scores = []
    for row in database.itertuples(name=None):
        # row = (index label, name, text cell, ..., similarity)
        row_text = next((cell for cell in row[2:-1] if isinstance(cell, str)), None)
        if row_text is None:
            # A missing cell is a float NaN and cannot be tokenised.
            scores.append(np.nan)
            continue
        row_text_vector = get_doc_vector(row_text, w2v, dictionary, tfidf)
        scores.append(float(input_text_vector.dot(row_text_vector)))

    # One column-wide write instead of one scalar write per row.
    database.iloc[:, -1] = np.asarray(scores, dtype=float)
    return database.sort_values(by='similarity', ascending=False)

def get_doc_vector(text, w2v, dictionary, tfidf):
    """Embed ``text`` as a unit-length, TF-IDF-weighted sum of word vectors.

    Words already in ``dictionary`` are used as they are.  A word that is
    unknown to the dictionary but known to ``w2v`` is replaced by the
    dictionary token most similar to it.  Any other word is ignored.

    Parameters
    ----------
    text : str
        Tokens separated by whitespace.
    w2v : word-vector model
        Must support ``word in w2v``, ``w2v[word]``, ``w2v.vector_size`` and
        ``w2v.most_similar_to_given(word, candidates)``.
    dictionary : token/id mapping
        Must provide ``token2id``, ``doc2bow(tokens)`` and ``dictionary[token_id]``.
    tfidf : TF-IDF model
        ``tfidf[bow]`` must yield ``(token_id, weight)`` pairs for a single
        bag-of-words document.

    Returns
    -------
    numpy.ndarray
        float32 vector of length ``w2v.vector_size``.  It is L2-normalised,
        or all zeros when no word of ``text`` could be mapped (so that dot
        products with it are 0 instead of NaN).
    """
    known_tokens = dictionary.token2id  # mapping -> O(1) membership tests
    candidates = None  # list form of the vocabulary, built only when needed

    # convert any unknown word to a known word
    new_text = []
    for word in text.split():
        if word in known_tokens:
            new_text.append(word)
        elif known_tokens and word in w2v:
            # replace the unknown word with the most similar token of the dictionary
            if candidates is None:
                candidates = list(known_tokens)
            # Positional arguments: the keyword names of this method differ
            # between gensim releases.
            new_text.append(w2v.most_similar_to_given(word, candidates))

    # TF-IDF weights of the (single) document
    tfidf_text = tfidf[dictionary.doc2bow(new_text)]

    # sum the weighted word vectors
    sum_vector = np.zeros(w2v.vector_size, dtype=np.float32)
    for word_id, weight in tfidf_text:
        sum_vector += w2v[dictionary[word_id]] * weight

    # normalise, leaving an all-zero vector untouched to avoid 0/0
    norm = np.sqrt(sum_vector.dot(sum_vector))
    if norm > 0:
        sum_vector /= norm

    return sum_vector

In [13]:
# Load the pre-trained word vectors from a word2vec-format text file.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the load
# did not finish in the recorded run -- re-run it to completion before the cells below.
w2v = KeyedVectors.load_word2vec_format("../w2v/model/wiki-news-300d-1M.vec")
# w2v = KeyedVectors.load("../w2v/model/fasttext_w2v_vector_64")

KeyboardInterrupt: 

In [14]:
# Build the cleaned table and the list of cleaned texts, then fit the
# dictionary and the TF-IDF model on those texts.
database, texts = get_database(w2v)
tfidf, dictionary = get_tfidf_and_dictionary(texts)

In [15]:
# Display the raw CSV for reference; `.iloc[:, :]` selects every row and column.
pd.read_csv("../input/InvestData_2017-Nov-22_0101.csv").iloc[:, :]

Unnamed: 0,No,Organisation,Type_1,Type_2,Region,Website,Description,Relevant_Products_and_Services,SectorsIndustries,Funding_details,Requirements__Eligibility,Examples,Application,Contact,Additional_Information1,Additional_Information2,Additional_Information3
0,0.0,filler,,,,,,,,,,,,,,,
1,1.0,The Africa Enterprise Challenge Fund (AECF),,,Africa,http://www.aecfafrica.org/,The AECF is an Africa-based challenge fund tha...,,,1) Challenge Fund model where funding is award...,"Vary from competition to competition, but gene...",M-Kopa\r,General Procedure\r\n\r\nBefore:\r\n1) Submit ...,"The AECF Limited,\r\n\r\nWest End Towers, Kanj...",,,
2,2.0,Alliance for a green revolution in Africa (AGRA),,,Africa,http://agra.org/grants/,AGRA aims to invest in projects that can have ...,,,No much information provided - companies are u...,,,,Nairobi\r\n\r\nTel: +254 (20) 3675 000 / +254 ...,,,
3,3.0,Global Innovation Fund,,,Africa,http://www.globalinnovation.fund,A government-sponsored non-profit fund in Lon...,,,"Amount: $230,000 max for pilot project\r\n* Pi...",More details: http://www.globalinnovation.fund...,,Accept applications all year round\r\n\r\n1) I...,http://www.globalinnovation.fund/contact-us,,,
4,4.0,Acumen Fund,,,Africa,http://acumen.org/,A charity organisation providing funding for e...,,,"Investment capital in the range of $0.25M-$3M,...",Social responsibility and demonstrated sustain...,M-Kopa\r\n\r\nD.Light (Global company; african...,Accept applications all year round\r\n\r\nMore...,Do not accept contacts prior to submission,,,
5,5.0,Bamboo Finance,,,Africa,http://www.bamboocp.com/,Bamboo Finance is a commercial private equity ...,,,,The company provides essential services afford...,,"Bamboo Finance Africa\r\n7th Floor, Purshottam...",,,,
6,6.0,African Development Fund,,,Africa,http://www.afdb.org/en/about-us/corporate-info...,The ADF contributes to the promotion of econom...,,,Areas of funding: \r\nThe African Development ...,,,East Africa Regional Resource Center (EARC)\r\...,,,,
7,7.0,Willow Impact,,,Africa,http://www.willowimpact.com/,An impact investment firm that manages and adv...,,,,"Geographic areas: Eastern Africa, the Middle E...",,http://www.willowimpact.com/contact-us/email-u...,,,,
8,8.0,Vista Ventures Social Impact Fund,,,Africa,http://www.vistaventures.com/,A California-based fund to provide education a...,,,Seek early stage companies that wish to raise ...,"By early stage, we expect that applicant compa...",,Submit:\r\n1) Business plan (or equivalent)\r\...,,,,
9,9.0,Grayghost ventures,,,Africa,http://www.grayghostventures.com/indexa.html,Seeks to eliminate poverty and strengthen comm...,,,Stage: Early-stage venture capital \r\n\r\nLoc...,Our investment approach prioritizes social ven...,M-Kopa,Submit an executive summary to\r\ninfo@graygho...,Headquarters:\r\nGray Ghost Ventures\r\n2200 C...,,,


In [16]:
# Free-text query that the database rows are ranked against.
input_text = "new start up aiming at low income customers, dedicated in green energy"

In [17]:
# Rank all rows by similarity to the query, then show the row at position 1
# of the ranking (i.e. the second-best match) as a plain list.
search_output = update_similarity(w2v, dictionary, tfidf, input_text, database)
list(search_output.iloc[1, :])

['Fund for Developing',
 '. invests enterprises low income countries promote business development contribute economic growth poverty alleviation . ’ s geographic focus eastern southern africa , well selected countries asia central america . focuses supporting small medium sized companies .',
 nan,
 0.87590879201889038]

In [44]:
# Document vectors for three hand-written example phrases.
vec1 = get_doc_vector('a startup that dedicate to green energy', w2v, dictionary, tfidf)
vec2 = get_doc_vector('business regrading green energy', w2v, dictionary, tfidf)
vec3 = get_doc_vector('companies specificly support low-income people', w2v, dictionary, tfidf)

In [46]:
# get_doc_vector normalises its result, so this dot product is the cosine similarity.
vec1.dot(vec2)

0.83994347

In [11]:
# NOTE(review): the saved output has 64 components and this cell's execution count
# (11) is lower than that of the cell defining vec1 (44), so the output looks like it
# comes from a different kernel session -- confirm by re-running after a restart.
vec1

array([-0.01663876, -0.00337446, -0.02395986, -0.04154515, -0.10667257,
        0.01662513, -0.01009543,  0.06161033,  0.05003018, -0.06181473,
       -0.02179295, -0.01145902, -0.01362746, -0.03405415,  0.00218448,
        0.00767476, -0.01452028, -0.0018257 ,  0.07659619, -0.04767921,
       -0.04837132, -0.02310556, -0.05546847, -0.00884663, -0.01575219,
        0.00686825, -0.02110741,  0.02846674, -0.00410359,  0.0267566 ,
        0.0219034 ,  0.01259031, -0.00153076,  0.0267643 ,  0.04200932,
       -0.05939888,  0.02810077, -0.01088257,  0.02167377,  0.05022477,
        0.00617732, -0.01705034,  0.04542316,  0.08962091, -0.01164754,
        0.0392498 , -0.02558717, -0.0163672 ,  0.06803226, -0.01166763,
        0.01418332,  0.04633227, -0.00154001,  0.00630221, -0.05789408,
        0.02684514,  0.03213207,  0.02908244, -0.05910822,  0.01098826,
       -0.04702628, -0.01257465,  0.02256784, -0.01471028], dtype=float32)

In [12]:
# NOTE(review): the saved output is element-for-element identical to the one shown for
# vec1, although the two phrases differ -- presumably a stale output; re-run to confirm.
vec2

array([-0.01663876, -0.00337446, -0.02395986, -0.04154515, -0.10667257,
        0.01662513, -0.01009543,  0.06161033,  0.05003018, -0.06181473,
       -0.02179295, -0.01145902, -0.01362746, -0.03405415,  0.00218448,
        0.00767476, -0.01452028, -0.0018257 ,  0.07659619, -0.04767921,
       -0.04837132, -0.02310556, -0.05546847, -0.00884663, -0.01575219,
        0.00686825, -0.02110741,  0.02846674, -0.00410359,  0.0267566 ,
        0.0219034 ,  0.01259031, -0.00153076,  0.0267643 ,  0.04200932,
       -0.05939888,  0.02810077, -0.01088257,  0.02167377,  0.05022477,
        0.00617732, -0.01705034,  0.04542316,  0.08962091, -0.01164754,
        0.0392498 , -0.02558717, -0.0163672 ,  0.06803226, -0.01166763,
        0.01418332,  0.04633227, -0.00154001,  0.00630221, -0.05789408,
        0.02684514,  0.03213207,  0.02908244, -0.05910822,  0.01098826,
       -0.04702628, -0.01257465,  0.02256784, -0.01471028], dtype=float32)