In [58]:
%matplotlib notebook
import os
import pickle
from urlparse import urlparse

import pandas as pd
import matplotlib.pyplot as plt
from ipywidgets import *
from IPython.display import display
from IPython.html import widgets
import seaborn as sn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tabulate import tabulate
import qgrid
from IPython.display import clear_output

# One-time qgrid setup; presumably installs qgrid's notebook assets so that
# QGridWidget tables can render, with overwrite=True replacing an existing
# install -- TODO confirm against the qgrid version in use.
qgrid.nbinstall(overwrite=True)


In [187]:
# Sample inputs for trying the search form by hand. Nothing in this cell is
# assigned to a variable; the string below is only echoed as the cell output.

# Example company description (for the description box):
'''Organizations protect the sensitive documents they are aware of, 
but out-of-sight documents are left unprotected and may fall into the hands of insiders and hackers. 
DocAuthority solves this by automatically discovering and accurately identifying unprotected sensitive documents, 
thus enabling a broad yet business-friendly security policy.'''

# Example company name (for the name box): airbnb

'Organizations protect the sensitive documents they are aware of, \nbut out-of-sight documents are left unprotected and may fall into the hands of insiders and hackers. \nDocAuthority solves this by automatically discovering and accurately identifying unprotected sensitive documents, \nthus enabling a broad yet business-friendly security policy.'

In [2]:
#load data tables and models
# Root folder of the exported CSV tables. The original local path stays as the
# default; set the NOTEBOOK_DATA_DIR environment variable to run the notebook
# against a copy of the data stored somewhere else.
path = os.environ.get('NOTEBOOK_DATA_DIR',
                      '/Users/Lucy/Google Drive/MSDS/2016Fall/DSGA1006/Data')

#competition, investment files
orgs = pd.read_csv(os.path.join(path, 'unsupervised', 'trunc_clustering.csv'), index_col=0)
comps = pd.read_csv(os.path.join(path, 'csv_export', 'competitors.csv'))
rounds = pd.read_csv(os.path.join(path, 'csv_export', 'funding_rounds.csv'))
investors = pd.read_csv(os.path.join(path, 'csv_export', 'investors.csv'))
investments = pd.read_csv(os.path.join(path, 'csv_export', 'investments.csv'))

# TF-IDF vectorizer for company descriptions. It is only configured here (no
# saved model is loaded); handle_submit fits it on every search.
vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000, min_df=2, stop_words='english',use_idf=True)

In [194]:
def url_validator(x):
    """Return True when ``x`` parses as a URL with a scheme and a network location.

    Parameters
    ----------
    x : str
        Candidate URL, e.g. ``'http://example.com'``.

    Returns
    -------
    bool
        False when the parsed scheme or netloc is empty, or when ``urlparse``
        raises for this input; True otherwise.
    """
    try:
        result = urlparse(x)
        # Truthiness rather than `== ""`: an empty bytes field (which urlparse
        # can return for non-str input) must also count as "missing".
        return bool(result.scheme and result.netloc)
    except Exception:
        # Best-effort validator: any parse failure means "not a URL". Catching
        # Exception instead of a bare except lets KeyboardInterrupt and
        # SystemExit propagate.
        return False
    
def get_investors(companies,index = None):
    """Recommend investors based on who funded a set of similar companies.

    Reads the module-level tables ``rounds``, ``investments``, ``investors``,
    ``comps`` and ``orgs``.

    Parameters
    ----------
    companies : pandas.DataFrame
        Similar companies; needs a ``company_uuid`` column.
    index : int, optional
        Positional row (``orgs.iloc``) of the company that was searched for.
        When given, companies listed in ``comps`` as its competitors are
        handled separately.

    Returns
    -------
    (rec_investors, comp_investors)
        ``rec_investors`` holds rows of ``investors`` plus a
        ``relevant_investments`` count, sorted by that count in descending
        order. ``comp_investors`` is the same kind of table built from the
        competitors, or None when ``index`` is None. Investors that appear in
        ``comp_investors`` are removed from ``rec_investors``.
    """
    def _investors_for(company_frame):
        # Funding rounds raised by these companies, then the investments made in them.
        company_rounds = rounds[rounds.company_uuid.isin(company_frame.company_uuid)]
        round_investments = investments[
            investments.funding_round_uuid.isin(company_rounds.funding_round_uuid)]

        # Number of matching investments per investor.
        counts = round_investments.groupby('investor_uuid').size().sort_values(ascending = False)
        counts = pd.DataFrame(counts).reset_index()
        counts.columns = ['investor_uuid','relevant_investments']

        # The default inner merge keeps only investors present in `counts`,
        # so no separate pre-filter of `investors` is needed.
        ranked = pd.merge(investors, counts, left_on = 'uuid', right_on = 'investor_uuid')
        return ranked.sort_values('relevant_investments', ascending = False)

    # Identity test: index 0 is a valid row and must not be treated as "missing".
    if index is None:
        return _investors_for(companies), None

    competitor_links = comps[comps.entity_uuid == orgs.iloc[index].company_uuid]
    competitor_ids = competitor_links.competitor_uuid

    rec_investors = _investors_for(companies[~companies.company_uuid.isin(competitor_ids)])
    comp_investors = _investors_for(orgs[orgs.company_uuid.isin(competitor_ids)])
    # Anyone who already backed a competitor is left out of the recommendations.
    rec_investors = rec_investors[~rec_investors.uuid.isin(comp_investors.uuid)]

    return rec_investors, comp_investors

In [193]:
# ---- Search form: headings and input widgets ----
title_1 = widgets.HTML(value = '<h2>Search Criteria</h2>')
display(title_1)
title_input_1 = widgets.HTML(value = '<h4>Or Enter company description</h4>')
title_input_3 = widgets.HTML(value = '<h4>Enter company name</h4>')
# `title` is the company-name text box (not a heading); handle_submit reads title.value.
title = widgets.Text(
    placeholder='company name',
    disabled=False,
    width = 800)
# Free-text description; handle_submit uses it only when the name box is empty.
desc = widgets.Textarea(
    placeholder='company name or description',
    disabled=False,
    width = 800)
title_input_2 = widgets.HTML(value = '<h4>Select max number of companies to return</h4>')
max_companies = widgets.IntSlider(
    value=30,
    min=5,
    max=100,
    step=1,
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='i',
    slider_color='white')
submit = widgets.Button(description="Submit")
clear = widgets.Button(description="Clear")

# ---- Layout: each input sits under its heading, all inputs share one box ----
buttons = widgets.HBox(children = [submit, clear])
name_box = widgets.VBox(children = [title_input_3,title])
desc_box = widgets.VBox(children = [title_input_1,desc])
limit_box = widgets.VBox(children = [title_input_2, max_companies])
desc_box.layout.padding = 100
input_container = widgets.Box(children=[name_box,desc_box, limit_box])
# This line used to read `inputs.margin=100` and sat before input_container was
# created. No `inputs` is defined anywhere in the visible notebook, so the cell
# raised NameError on a fresh kernel. Presumably the margin was meant for the
# input container -- TODO confirm the intended target.
input_container.margin = 100
display(input_container)
display(buttons)

# ---- Empty result containers; handle_submit fills them on each search ----
companies_container = widgets.Box()
display(companies_container)
investors_container = widgets.Box()
display(investors_container)
comp_investors_container = widgets.Box()
display(comp_investors_container)

def handle_submit(sender):
    """Button callback: find companies similar to the form input and show results.

    The company name takes priority; the description box is used only when the
    name box is empty. Similarity is the cosine similarity of TF-IDF vectors of
    the ``orgs.description`` texts. At most ``max_companies.value`` companies
    are shown in ``companies_container``; investors from ``get_investors`` go
    to ``investors_container`` and ``comp_investors_container``.

    Parameters
    ----------
    sender
        The widget that triggered the callback (unused).
    """
    name_query = title.value.strip().lower()
    desc_text = desc.value
    limit = max_companies.value

    # Drop the output of any previous search so stale tables never sit next to new ones.
    companies_container.children = []
    investors_container.children = []
    comp_investors_container.children = []

    descriptions = orgs.description.astype(str)

    if name_query:
        # Literal substring match (regex=False) so names containing characters
        # such as '+' or '(' are not interpreted as regular expressions.
        name_hits = orgs.company_name.str.lower().str.contains(name_query, na=False, regex=False)
        hit_positions = name_hits.values.nonzero()[0]
        if len(hit_positions) == 0:
            companies_container.children = [
                widgets.HTML(value = '<h4>No company matches that name</h4>')]
            return
        # Positional row of the first match; used with orgs.iloc and as a row of X.
        index_value = int(hit_positions[0])
        X = vectorizer.fit_transform(descriptions)
        scores = cosine_similarity(X[index_value:index_value+1], X).flatten()
        ranked = scores.argsort()[::-1]
        # Exclude the searched company explicitly instead of assuming it ranks first.
        indices = ranked[ranked != index_value][:limit]

    elif desc_text.strip():
        # Row 0 of X is the typed description; rows 1.. line up with orgs rows 0..
        X = vectorizer.fit_transform([desc_text] + list(descriptions))
        scores = cosine_similarity(X[0:1], X).flatten()
        ranked = scores.argsort()[::-1]
        # Remove the query row before shifting, so the shift can never produce -1.
        indices = ranked[ranked != 0][:limit] - 1
        index_value = None

    else:
        companies_container.children = [
            widgets.HTML(value = '<h4>Enter a company name or a description</h4>')]
        return

    most_similar = orgs.iloc[indices]

    title_2 = widgets.HTML(value = '<h2>Similar Companies</h2>')
    company_output = most_similar[['company_name','short_description','founded_on','homepage_url']]
    _companies = qgrid.QGridWidget(df=company_output)

    companies_container.children=[title_2, _companies]

    investor_columns = ['investor_name','country_code','investor_type','relevant_investments']
    try:
        rec_investors, comp_investors = get_investors(most_similar,index = index_value)
        title_3 = widgets.HTML(value = '<h2>Recommended Investors</h2>')
        _investors = qgrid.QGridWidget(df=rec_investors[investor_columns])
        investors_container.children = [title_3, _investors]

        # get_investors returns None for the competitor table on description searches.
        if comp_investors is not None:
            title_4 = widgets.HTML(value = '<h2>Investors invested in competitive companies</h2>')
            _comp_investors = qgrid.QGridWidget(df=comp_investors[investor_columns])
            comp_investors_container.children = [title_4, _comp_investors]
    except Exception as err:
        # Still best-effort, but report the reason rather than hiding it.
        print("no investor recommended: %s" % err)

# Run handle_submit on every click of the Submit button.
# NOTE(review): no handler is registered for the `clear` button in the visible
# source, so clicking Clear appears to do nothing -- confirm whether one is missing.
submit.on_click(handle_submit)


no investor recommended
