In [172]:
%matplotlib notebook
import pandas as pd
import matplotlib.pyplot as plt
from ipywidgets import *
from IPython.display import display
from IPython.html import widgets
import seaborn as sn
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from urlparse import urlparse
from sklearn.metrics.pairwise import cosine_similarity
from tabulate import tabulate
#plt.style.use('ggplot')


In [135]:
# Data tables and text model used by the rest of the notebook.
# NOTE(review): `path` is an absolute, machine-specific folder.
path = '/Users/Lucy/Google Drive/MSDS/2016Fall/DSGA1006/Data'

# Company table; the first CSV column becomes the DataFrame index.
orgs = pd.read_csv(path + '/unsupervised/trunc_clustering.csv', index_col=0)

# Competition and investment tables, read in the same order as before.
comps, rounds, investors, investments = (
    pd.read_csv(path + suffix)
    for suffix in (
        '/csv_export/competitors.csv',
        '/csv_export/funding_rounds.csv',
        '/csv_export/investors.csv',
        '/csv_export/investments.csv',
    )
)

# TF-IDF vectorizer. It is only constructed here (nothing is loaded from disk);
# fitting happens wherever `fit_transform` is called on it.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
    max_df=0.5,
    min_df=2,
    use_idf=True,
)

In [205]:
def url_validator(x):
    """Return True when ``x`` parses as a URL with both a scheme and a host.

    Parameters
    ----------
    x : object
        Value to test, normally a text string.

    Returns
    -------
    bool
        True only if ``urlparse(x)`` yields a non-empty ``scheme`` and a
        non-empty ``netloc``. False otherwise, including for values that
        ``urlparse`` cannot process (``None``, numbers, malformed URLs).
    """
    try:
        result = urlparse(x)
    except (ValueError, AttributeError, TypeError):
        # Only the errors an unparseable value can raise are treated as
        # "not a URL"; a bare `except` would also swallow KeyboardInterrupt
        # and SystemExit.
        return False
    # Truthiness (rather than `== ""`) also rejects empty bytes fields.
    return bool(result.scheme and result.netloc)
    
def get_investors(companies,index = None):
    """Rank investors by how many investments they made in ``companies``.

    Reads the module-level tables ``comps``, ``rounds``, ``investments`` and
    ``investors`` (and ``orgs`` as a fallback, see ``index``).

    Parameters
    ----------
    companies : pandas.DataFrame
        Candidate companies; must have a ``company_uuid`` column.
    index : label, optional
        Index label of a reference company. When given, every candidate that
        ``comps`` lists as a competitor of that company is removed before
        investors are counted. The label is looked up in ``companies`` first
        and, if it is not one of its rows, in ``orgs``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``investors`` that invested in the remaining companies, with
        added ``investor_uuid`` and ``count`` columns, sorted by ``count``
        in descending order.

    Raises
    ------
    KeyError
        If ``index`` is found in neither ``companies`` nor ``orgs``.
    """
    if index is None:
        clean_companies = companies.copy()
    else:
        # The reference company is usually not a row of `companies` (a
        # similarity list that drops the query itself), so `companies.loc`
        # alone raised KeyError. Fall back to the full company table.
        lookup = companies if index in companies.index else orgs
        reference_uuid = lookup.loc[index].company_uuid
        _comps = comps[comps.entity_uuid == reference_uuid]
        clean_companies = companies[~companies.company_uuid.isin(_comps.competitor_uuid)]

    # Funding rounds of the remaining companies, then the investments in them.
    _rounds = rounds[rounds.company_uuid.isin(clean_companies.company_uuid)]
    _invests = investments[investments.funding_round_uuid.isin(_rounds.funding_round_uuid)]

    # Number of investments per investor.
    _investors = pd.DataFrame(_invests.groupby('investor_uuid').size().sort_values(ascending = False)).reset_index()
    _investors.columns = ['investor_uuid','count']

    # Attach the investor details and order by the investment count.
    rec_investors = investors[investors.uuid.isin(_investors.investor_uuid)]
    rec_investors = pd.merge(rec_investors,_investors, left_on = 'uuid',right_on = 'investor_uuid')
    rec_investors = rec_investors.sort_values('count',ascending = False)

    return rec_investors

In [218]:
# Horizontal container for the input row; it is displayed right away and
# populated further down.
cb_container = widgets.HBox()
display(cb_container)

# Multi-line text box for the query.
desc = widgets.Textarea(
    description='Company website or long description: ',
    placeholder='company name or description',
    width=400,
    disabled=False,
)

# Button that triggers the search.
button = widgets.Button(description="Submit")

items = [desc, button]
cb_container.children = list(items)

# Snapshot of the text box content at the moment this cell runs; it is a plain
# value and does not follow later edits made in the widget.
text_input = desc.value

def _rank_similar(scores, skip, limit=30):
    """Return up to ``limit`` row positions by descending score, leaving out ``skip``."""
    order = scores.argsort()[::-1]
    # Remove the query row explicitly instead of assuming it is ranked first;
    # with tied or all-zero scores it may not be.
    return order[order != skip][:limit]


def _find_company_position(query):
    """Return the row position in ``orgs`` of the first company matching ``query``, or None."""
    needle = query.strip().lower()
    host = urlparse(needle).netloc
    if host.startswith('www.'):
        host = host[4:]
    # NOTE(review): the original only searched company_name with the typed
    # text. Matching the URL host against homepage_url is assumed to be the
    # intent of the "Company website" prompt -- confirm. The name search is
    # kept as a second attempt.
    for column, pattern in (('homepage_url', host), ('company_name', needle)):
        if not pattern:
            continue
        # regex=False: URLs contain '.', '?', '+', which must match literally.
        hits = orgs[column].str.lower().str.contains(pattern, na=False, regex=False)
        positions = hits.values.nonzero()[0]
        if len(positions):
            # Row position, not index label: the TF-IDF matrix is positional.
            return int(positions[0])
    return None


def handle_submit(sender):
    """Button callback: show the most similar companies and their top investors.

    The text currently in ``desc`` is read on every click. If it passes
    ``url_validator`` the matching company is looked up in ``orgs`` and its
    description is used as the query; otherwise the text itself is the query.
    Descriptions are compared with TF-IDF vectors and cosine similarity, and
    the 30 closest companies are kept. The first five are displayed, followed
    by the first five investors returned by ``get_investors``.

    Parameters
    ----------
    sender : object
        The widget that fired the event (unused).

    Side effects
    ------------
    Refits the module-level ``vectorizer``, sets the pandas option
    ``display.max_colwidth`` and writes to the cell output.
    """
    # Read the box at click time. The module-level `text_input` was captured
    # when the cell ran and never reflects what the user typed.
    query = desc.value
    if not query.strip():
        print('please enter a company website or description')
        return

    descriptions = orgs.description.astype(str)
    query_uuid = None

    if url_validator(query):
        position = _find_company_position(query)
        if position is None:
            print('no matching company found')
            return
        X = vectorizer.fit_transform(descriptions)
        scores = cosine_similarity(X[position:position + 1], X).flatten()
        indices = _rank_similar(scores, position)
        query_uuid = orgs.iloc[position].company_uuid
    else:
        # Row 0 of the matrix is the typed text; company i sits at row i + 1.
        X = vectorizer.fit_transform([query] + list(descriptions))
        scores = cosine_similarity(X[0:1], X).flatten()
        indices = _rank_similar(scores, 0) - 1

    most_similar = orgs.iloc[indices]

    pd.set_option('display.max_colwidth', -1)
    display(most_similar[['company_name','short_description','founded_on','homepage_url']].head())

    print('\n')
    print('-----------------------------------------------------------------------------------------------------')
    print('\n')

    # Drop known competitors of the query company here. The query row is not
    # part of `most_similar`, so it cannot be looked up there by label.
    if query_uuid is not None:
        rivals = comps.loc[comps.entity_uuid == query_uuid, 'competitor_uuid']
        candidates = most_similar[~most_similar.company_uuid.isin(rivals)]
    else:
        candidates = most_similar

    try:
        rec_investors = get_investors(candidates)
        if rec_investors.empty:
            print("no investor recommended")
        else:
            display(rec_investors[['investor_name','country_code','investor_type','count']].head())
    except Exception as err:
        # Still best-effort, but no longer a bare `except`, and the reason is shown.
        print("no investor recommended")
        print('(%s: %s)' % (type(err).__name__, err))

button.on_click(handle_submit)

Unnamed: 0,company_name,short_description,founded_on,homepage_url
27765,DocumentCloud,"Document Cloud, Inc., a nonprofit company",2009-01-01,http://www.documentcloud.org/home
17723,Qwilr,"Qwilr is a cloud-based replacement for Microsoft’s Word, Excel and Powerpoint",2014-01-01,http://qwilr.com
40519,Docracy,Docracy operates an open source website where users can sign and share legal documents curated by those who use them.,2011-01-01,http://www.docracy.com
58185,Mimeo,Innovator of content management & distribution. Print. Digital. Blended.,1998-01-01,http://www.mimeo.com
43548,doo,doo is a cloud-based solution with native apps for consumers and small businesses that allows them to access their documents.,2011-06-15,http://doo.net




-----------------------------------------------------------------------------------------------------




Unnamed: 0,investor_name,country_code,investor_type,count
2,Wayra,ESP,accelerator,8
22,Gemini Israel Ventures,ISR,venture_capital,4
62,Murphree Venture Partners,USA,venture_capital,3
15,Shasta Ventures,USA,venture_capital,3
63,Wolf Ventures,USA,venture_capital,3
