In [2]:
import os
import time

import numpy as np
import pandas as pd
import requests

* Define a function that loads a company list and optionally filters it by founding year, industry, and size.

In [3]:
def load_companies(path, filter_year=None, filter_industry=None, filter_size=None):
    """Load a company table from a JSON file and optionally filter it.

    Parameters
    ----------
    path : str
        Location of the JSON file; it is read with ``pd.read_json``.
    filter_year : iterable of int or str, optional
        Keep only rows whose ``founded`` value equals one of these years.
        The comparison works whether ``founded`` was parsed as text or as
        a number.
    filter_industry : iterable of str, optional
        Keep only rows whose ``industry`` value is in this collection.
    filter_size : iterable of int, optional
        Keys of the size buckets to keep: 10, 50, 200, 500, 1000, 5000,
        10000 or 10001 (mapped to '1-10' ... '10001+').

    Returns
    -------
    tuple of (pandas.DataFrame, numpy.ndarray)
        The filtered frame and an array built from its ``id`` column.

    Raises
    ------
    KeyError
        If ``filter_size`` contains a value that is not a known bucket key.
    """
    size_dict = {10: '1-10',
                 50: '11-50',
                 200: '51-200',
                 500: '201-500',
                 1000: '501-1000',
                 5000: '1001-5000',
                 10000: '5001-10000',
                 10001: '10001+'
                }

    df = pd.read_json(path)

    if filter_year:
        years = list(filter_year)
        founded = df['founded']
        # Text match: covers a `founded` column that was kept as strings.
        text_match = founded.isin([str(y) for y in years])
        # Numeric match: covers a `founded` column that read_json parsed as
        # int/float, where comparing against strings would match nothing.
        wanted_numbers = pd.to_numeric(pd.Series(years, dtype=object),
                                       errors='coerce').dropna()
        number_match = pd.to_numeric(founded, errors='coerce').isin(wanted_numbers)
        df = df.loc[text_match | number_match]

    if filter_industry:
        df = df.loc[df['industry'].isin(filter_industry)]

    if filter_size:
        sizes = list(filter_size)
        unknown = [s for s in sizes if s not in size_dict]
        if unknown:
            # Same exception type as a plain dict lookup, but with a hint.
            raise KeyError('unknown filter_size value(s) {}; valid keys are {}'
                           .format(unknown, sorted(size_dict)))
        df = df.loc[df['size'].isin([size_dict[s] for s in sizes])]

    return df, np.array(df['id'])

* Define a function that calls the Accountstory decision-makers (DM) API once per company and collects the results in a pandas DataFrame.

In [4]:
def load_data(company_list, per_page, page, api_key=None, timeout=30):
    """Query the accountstory decision_makers endpoint once per company.

    Parameters
    ----------
    company_list : sequence of str
        Company identifiers, sent one at a time as ``company_name``.
    per_page : int
        Value sent as the ``per_page`` form field.
    page : int
        Value sent as the ``page`` form field.
    api_key : str, optional
        API key sent as the ``api_key`` query parameter. When omitted it is
        read from the ``ACCOUNTSTORY_API_KEY`` environment variable, so the
        secret never has to be stored in the notebook.
    timeout : float, optional
        Per-request timeout in seconds passed to ``requests.post``.

    Returns
    -------
    pandas.DataFrame
        All ``records`` entries returned by the API, concatenated with a
        fresh index. Empty if no request produced any records.

    Raises
    ------
    ValueError
        If no API key is supplied and the environment variable is not set.

    Notes
    -----
    The loop is best-effort: a company whose request fails, or whose
    response cannot be parsed, is skipped and counted rather than aborting
    the whole run.
    """
    if api_key is None:
        api_key = os.environ.get('ACCOUNTSTORY_API_KEY')
    if not api_key:
        raise ValueError('no API key: pass api_key=... or set the '
                         'ACCOUNTSTORY_API_KEY environment variable')

    url = 'https://accountstory.com/api/v2/decision_makers'
    headers = {'Content-Type': 'application/x-www-form-urlencoded', 'Accept': 'application/json'}

    frames = []    # one DataFrame per company that returned records
    n_rows = 0     # running row total, only used for the progress line
    n_failed = 0   # companies skipped because of an error
    n = len(company_list)
    for i, company in enumerate(company_list):
        if i % 100 == 0:
            print(i,'/',n,'---', n_rows, 'data points ---', time.strftime('%H:%M'))

        try:
            # Passing a dict lets requests URL-encode the values, so company
            # names containing '&', '=' or spaces no longer corrupt the body.
            req = requests.post(url,
                                params={'api_key': api_key},
                                headers=headers,
                                data={'company_name': company,
                                      'per_page': per_page,
                                      'page': page},
                                timeout=timeout
                               )
            req.raise_for_status()
            df_temp = pd.DataFrame(req.json()['records'])
        except (requests.RequestException, ValueError, KeyError, TypeError):
            # Network/HTTP error, invalid JSON, or an unexpected payload
            # shape: skip this company but keep going.
            n_failed += 1
            continue

        if not df_temp.empty:
            frames.append(df_temp)
            n_rows += len(df_temp)

    if n_failed:
        print(n_failed, 'of', n, 'requests failed and were skipped')
    print('done')

    # Concatenate once at the end instead of growing a frame inside the loop.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

* Load the company DataFrame and the list of company IDs (dataset created from https://www.peopledatalabs.com/company-dataset, filtered with MySQL so that it contains **only CA companies and only tech**).

In [5]:
# Load the complete company table -- no filters are applied here.
# To narrow the selection, pass any of the optional arguments, e.g.
#   filter_year=[2019],
#   filter_industry=['information technology and services'],
#   filter_size=[500, 1000]
company_df, company_list = load_companies('../data/company_list_CA.json')
company_df

Unnamed: 0,id,founded,industry,linkedin_url,size,website
0,zennify,2013,computer software,linkedin.com/company/zennify,51-200,zennify.com
1,systech-solutions,1993,information technology and services,linkedin.com/company/systech-solutions,201-500,systechusa.com
2,ixpsoft,2010,computer software,linkedin.com/company/ixpsoft,51-200,myopmo.com
3,continuous-computing,1998,telecommunications,linkedin.com/company/continuous-computing,201-500,ccpu.com
4,koncepteducationinc,2007,e-learning,linkedin.com/company/koncepteducationinc,51-200,keykoncept.com
...,...,...,...,...,...,...
8045,eurisko-mobility,2010,computer software,linkedin.com/company/eurisko-mobility,51-200,euriskomobility.com
8046,usendmoneytransfer,2005,financial services,linkedin.com/company/usendmoneytransfer,51-200,usend.com
8047,sumo-logic,2010,computer software,linkedin.com/company/sumo-logic,501-1000,sumologic.com
8048,california-service-bureau,,financial services,linkedin.com/company/california-service-bureau,51-200,californiaservicebureau.com


* Load the decision-maker data from the API for every company in the list.

In [5]:
# get 50 people for each company (per_page=50, page=0)
df = load_data(company_list, per_page=50, page=0)

0 / 8050 --- 0 data points --- 16:28
100 / 8050 --- 178 data points --- 16:30
200 / 8050 --- 363 data points --- 16:32
300 / 8050 --- 545 data points --- 16:34
400 / 8050 --- 619 data points --- 16:36
500 / 8050 --- 799 data points --- 16:39
600 / 8050 --- 971 data points --- 16:41
700 / 8050 --- 1076 data points --- 16:43
800 / 8050 --- 1378 data points --- 16:45
900 / 8050 --- 1687 data points --- 16:48
1000 / 8050 --- 1908 data points --- 16:50
1100 / 8050 --- 2203 data points --- 16:52
1200 / 8050 --- 2387 data points --- 16:54
1300 / 8050 --- 2663 data points --- 16:56
1400 / 8050 --- 2914 data points --- 16:59
1500 / 8050 --- 3065 data points --- 17:01
1600 / 8050 --- 3235 data points --- 17:03
1700 / 8050 --- 3399 data points --- 17:05
1800 / 8050 --- 3564 data points --- 17:07
1900 / 8050 --- 3773 data points --- 17:10
2000 / 8050 --- 3941 data points --- 17:12
2100 / 8050 --- 4194 data points --- 17:14
2200 / 8050 --- 4421 data points --- 17:16
2300 / 8050 --- 4563 data points

In [7]:
# Write the collected API records to disk as JSON.
df.to_json(path_or_buf='../data/people_list_DM.json')