Loading investors data

In [2]:
import re
from pathlib import Path

import pandas as pd

# Read the investors file into a DataFrame. Later cells read its 'Stage', 'Type',
# 'Industry', 'Overview', 'Countries' and 'Cheque_range' columns.
data = pd.read_json('Data/investors.json')

Helper that splits a comma-separated field into a list of trimmed, lowercase items

In [3]:
def clean(s):
    """Split a comma-separated field into trimmed, lowercase items.

    Parameters
    ----------
    s : str, list, tuple, or missing value
        A string such as ``"Seed, Series A"``. A list/tuple of such strings is
        flattened item by item. Anything else (``None``, NaN, numbers) is
        treated as a missing cell.

    Returns
    -------
    list of str
        The non-empty items, stripped of surrounding whitespace and
        lowercased, in their original order. Missing input yields ``[]``.
    """
    if isinstance(s, (list, tuple)):
        # Some exports store multi-valued fields as arrays; normalise each entry.
        items = []
        for entry in s:
            items.extend(clean(entry))
        return items
    if not isinstance(s, str):
        # None / NaN / other scalars carry no labels; do not crash the caller's loop.
        return []
    # str.split already returns [s] when there is no comma, so no special case is needed.
    return [part.strip().lower() for part in s.split(',') if part.strip()]

All available stages from all the investors

In [4]:
# Gather every distinct stage label found in the 'Stage' column.
stage_labels = set()
for stage_field in data['Stage']:
    stage_labels.update(clean(stage_field))
stages = list(stage_labels)
print(stages)

['prototype/mvp', 'series++', 'post revenue', 'idea', 'scaling', 'idea or mvp', 'seed', 'series a', 'angel', 'pre-ipo', 'pre-seed', 'stage', 'series+']


All the investor types present in the data (the `Type` column, which the matching code calls the "model")

In [6]:
# Gather every distinct label found in the 'Type' column.
type_labels = set()
for type_field in data['Type']:
    type_labels.update(clean(type_field))
models = list(type_labels)
print(models)

['solo capitalist', 'family office', 'startup studio', 'multi-stage vc', 'incubator', 'angel', 'pe fund', 'public fund', 'accelerator', 'pre-seed fund', 'solo angel', 'other', 'corporate vc', 'vc', 'seed fund', 'full-time operator', 'investor_type', 'full-time angel', 'angel network', 'revenue-based']


In [7]:
# Gather every distinct industry label, skipping rows whose 'Industry' cell is
# identical to the 'Overview' cell.
# NOTE(review): presumably those rows have the overview text copied into the
# industry field; confirm against the data source.
industry_labels = set()
# Pair the two columns by position instead of looking rows up with data[col][i]:
# that lookup treats i as an index *label* and fails for any non-0..n-1 index.
for industry_field, overview_field in zip(data['Industry'], data['Overview']):
    if industry_field != overview_field:
        industry_labels.update(clean(industry_field))
industries = list(industry_labels)
print(industries)

['wellness', 'personal finance', 'energy', 'dna/rna', 'security & investigations', 'crypto/blockchain', 'aviation & aerospace', 'broadcast media', 'logistics & supply chain', 'hardtech', 'higher education', 'transportation', 'productivity', 'public relations & communications', 'api', 'supply chain/logistics', 'telecommunications', 'internet and mobile', 'mental health', 'veterinary', 'renewables & environment', 'consumer electronics', 'climate/sustainability', 'iot', 'marketplace', 'market research', 'outsourcing/offshoring', 'entertainment', 'gaming', 'philanthropy', 'materials', 'hospital & health care', 'proteins', 'publishing', 'information technology & services', 'mobility', 'food & beverage', 'electrical/electronic manufacturing', 'pharmaceuticals', 'industry', 'retail', 'automotive', 'insuretech', 'freight', 'consumer', 'semiconductors', 'language learning', 'cloud', 'future of work', 'ar/vr', 'agritech', 'apparel & fashion', 'healthcare/medtech', 'construction', 'diagnostics', 

In [8]:
# Gather every distinct label found in the 'Countries' column.
country_labels = set()
for countries_field in data['Countries']:
    country_labels.update(clean(countries_field))
countries = list(country_labels)
print(countries)

['usa', 'saudi arabia', 'sweden', 'finland', 'morocco', 'singapore', 'kenya', 'grenada', 'haiti', 'tuvalu', 'ukraine', 'guyana', 'africa', 'hungary', 'saint vincent and the grenadines', 'kazakhstan', 'cuba', 'tajikistan', 'gambia', 'antigua and barbuda', 'colombia', 'syria', 'mauritania', 'chile', 'sant lucia', 'bangladesh', 'malawi', 'maldives', 'somalia', 'ethiopia', 'oceania', 'belgium', 'equatorial guinea', 'guinea', 'india', 'marshall islands', 'malaysia', 'monaco', 'poland', 'bhutan', 'timor-leste', 'nauru', 'serbia', 'kuwait', 'new zealand', 'lithuania', 'yemen', 'cabo verde', 'azerbaijan', 'southeast asia', 'cyprus', 'malta', 'kyrgyzstan', 'sierra leone', 'uk', 'zimbabwe', 'global', 'italy', 'australia', 'cameroon', 'mauritius', 'estonia', 'slovenia', 'zambia', 'spain', 'philippines', 'bahrain', 'san marino', 'tunisia', 'cambodia', 'nigeria', 'thailand', "côte d'ivoire", 'botswana', 'paraguay', 'lesotho', 'central african republic', 'montenegro', 'canada', 'andorra', 'portugal'

Finding Scores for each field

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def match_overview(desc,inv_data):
    """Score every investor overview by its textual similarity to ``desc``.

    For each entry of ``inv_data['Overview']`` a TF-IDF model is fitted on just
    the pair (``desc``, overview) and the cosine similarity of the two vectors
    is converted to ``int(similarity * 100) + 1``.

    Parameters
    ----------
    desc : str
        The founder's description text.
    inv_data : mapping with an ``'Overview'`` entry
        Iterable of overview texts, one per investor.

    Returns
    -------
    list of int
        One score per overview, in the same order.
    """
    vectorizer = TfidfVectorizer()

    def similarity_points(overview):
        # Re-fit on this pair only, so the vocabulary and IDF weights come
        # from exactly these two texts.
        pair_matrix = vectorizer.fit_transform([desc, overview]).toarray()
        similarity = cosine_similarity(pair_matrix[0:1], pair_matrix[1:2])[0][0]
        # Truncate to a whole percentage and add one.
        return int(similarity * 100) + 1

    return [similarity_points(overview) for overview in inv_data['Overview']]

def _overlap_scores(wanted, inv_data, column, points):
    """Score each row of ``inv_data[column]`` by its overlap with ``wanted``.

    Both sides are normalised the same way (trimmed, lowercased) before they
    are compared, and each distinct wanted value counts at most once per row.

    Parameters
    ----------
    wanted : str, iterable of str, or None
        The founder's values for this field. A bare string is treated as a
        single value; ``None`` means "no preference" and scores 0 everywhere.
    inv_data : mapping
        Investor data; ``inv_data[column]`` yields one comma-separated cell
        per investor.
    column : str
        Name of the investor column to compare against.
    points : int
        Points awarded per matching value.

    Returns
    -------
    list of int
        One score per investor, in row order.
    """
    if wanted is None:
        wanted = []
    elif isinstance(wanted, str):
        # Iterating a bare string would compare single characters.
        wanted = [wanted]
    # clean() lowercases and strips the investor side, so do the same here;
    # the set also prevents a repeated founder value from scoring twice.
    targets = {str(value).strip().lower() for value in wanted}
    targets.discard('')
    return [points * len(targets.intersection(clean(cell)))
            for cell in inv_data[column]]


def match_industry(industry,inv_data):
    """Return 15 points per founder industry listed in each investor's 'Industry' cell."""
    return _overlap_scores(industry, inv_data, 'Industry', 15)


def match_stage(stage,inv_data):
    """Return 10 points per founder stage listed in each investor's 'Stage' cell."""
    return _overlap_scores(stage, inv_data, 'Stage', 10)


def match_region(region,inv_data):
    """Return 5 points per founder region listed in each investor's 'Countries' cell."""
    return _overlap_scores(region, inv_data, 'Countries', 5)


def match_model(model,inv_data):
    """Return 5 points per founder model listed in each investor's 'Type' cell."""
    return _overlap_scores(model, inv_data, 'Type', 5)

def match_funds(funds,inv_data):
    """Award 30 points to every investor whose cheque range contains ``funds``.

    Each cell of ``inv_data['Cheque_range']`` is parsed as one or two amounts,
    e.g. ``"$50k - $100k"``, ``"$1.5M-$3M"``, ``"$10,000-$60,000"`` or
    ``"$25k"``. ``k`` and ``m`` suffixes (any case) multiply by one thousand
    and one million. A single amount means an exact match is required, unless
    it is followed by ``+`` (``"$100k+"``), which is read as "that much or
    more". When more than two amounts appear, only the first two are used.

    Parameters
    ----------
    funds : int or float
        The amount the founder is looking for.
    inv_data : mapping with a ``'Cheque_range'`` entry
        Iterable of cheque-range cells, one per investor.

    Returns
    -------
    list of int
        30 where ``funds`` lies inside the investor's range (bounds
        inclusive), otherwise 0. Cells that are missing or contain no
        parsable amount score 0 instead of aborting the whole ranking.
    """
    # A number (optionally decimal) followed by an optional k/m suffix.
    amount_pattern = re.compile(r'(\d+(?:\.\d+)?|\.\d+)\s*([km]?)')
    multiplier = {'': 1, 'k': 1_000, 'm': 1_000_000}

    scores = []
    for cheque in inv_data['Cheque_range']:
        score = 0
        if isinstance(cheque, str):
            text = cheque.lower().replace('$', '').replace(',', '')
            # Scale each number by its own suffix; substituting zeros for the
            # suffix letter would turn "1.5m" into the unparsable "1.5000000".
            amounts = [round(float(number) * multiplier[suffix])
                       for number, suffix in amount_pattern.findall(text)]
            if amounts:
                low = amounts[0]
                if len(amounts) > 1:
                    high = amounts[1]
                elif text.rstrip().endswith('+'):
                    high = float('inf')
                else:
                    high = low
                # min/max tolerates ranges written high-to-low.
                if min(low, high) <= funds <= max(low, high):
                    score = 30
        scores.append(score)
    return scores

Summing up all the Scores from all fields and finding best matches

In [10]:
def match(founders_input,data,top_n=30):
    """Rank investors against a founder profile and return the best matches.

    The total score of an investor is the sum of the per-field scores from
    ``match_overview``, ``match_industry``, ``match_stage``, ``match_region``,
    ``match_model`` and ``match_funds``.

    Parameters
    ----------
    founders_input : dict
        Must provide the keys ``'desc'``, ``'industry'``, ``'stage'``,
        ``'region'``, ``'model'`` and ``'funds'``.
    data : pandas.DataFrame
        Investor table, one row per investor.
    top_n : int or None, default 30
        How many of the highest-scoring rows to return; ``None`` returns all
        rows in ranked order.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the selected rows of ``data`` in descending score
        order, with an added ``'Score'`` column. ``data`` itself is not
        modified.
    """
    per_field_scores = [
        match_overview(founders_input['desc'], data),
        match_industry(founders_input['industry'], data),
        match_stage(founders_input['stage'], data),
        match_region(founders_input['region'], data),
        match_model(founders_input['model'], data),
        match_funds(founders_input['funds'], data),
    ]
    totals = [sum(row_scores) for row_scores in zip(*per_field_scores)]

    # sorted() is stable even with reverse=True, so tied investors keep their
    # original relative order.
    ranked_positions = sorted(range(len(totals)),
                              key=totals.__getitem__,
                              reverse=True)[:top_n]

    # The scores are positional (row 0, 1, 2, ...), so select with iloc; loc
    # would interpret the numbers as index labels. assign() returns a new frame
    # rather than writing a column onto an indexing result.
    return data.iloc[ranked_positions].assign(
        Score=[totals[position] for position in ranked_positions]
    )

In [12]:
# Example founder profile. match() reads every key except 'name', which is
# only used to build the output file name.
founders_input = dict(
    name='Sample',
    desc='An AI based startup, focused on LLMs',
    industry=['information technology & services', 'ai and ml', 'api', 'cloud', 'saas'],
    stage=['seed', 'idea'],
    region=['india'],
    model=['seed fund'],
    funds=55000,
)
df = match(founders_input, data)

In [13]:
# to_csv does not create missing folders, so make sure the output directory
# exists before writing the ranked list.
results_dir = Path("Results")
results_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(results_dir / f"list_{founders_input['name']}.csv",index=False)