In [37]:
import re
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import sparse_dot_topn.sparse_dot_topn as ct
import time

In [11]:
import boto3
from sagemaker import get_execution_role

In [12]:
# Fetch the execution role through the SageMaker SDK helper imported above.
# NOTE(review): `role` is not referenced again in the cells visible here — confirm it is still needed.
role = get_execution_role()

In [13]:
# First component of the s3:// URL built for the input CSV (the bucket name).
bucket='similarityengine1-sagemaker'

In [14]:
# Object key of the input CSV inside the bucket (second component of the s3:// URL).
data_key = 'sec__edgar_company_info.csv'

In [15]:
# Full s3:// URL of the input CSV; this is the path handed to pd.read_csv.
data_location = 's3://{}/{}'.format(bucket, data_key)

In [45]:
# Quick look at the raw file. Only the first rows are parsed for this preview;
# the full table is loaded separately into its own variable.
pd.read_csv(data_location, nrows=10)

Unnamed: 0,Line Number,Company Name,Company CIK Key
0,1,!J INC,1438823
1,2,"#1 A LIFESAFER HOLDINGS, INC.",1509607
2,3,#1 ARIZONA DISCOUNT PROPERTIES LLC,1457512
3,4,#1 PAINTBALL CORP,1433777
4,5,$ LLC,1427189
5,6,& S MEDIA GROUP LLC,1447162
6,7,&TV COMMUNICATIONS INC.,1479357
7,8,"'MKTG, INC.'",886475
8,9,'OHANA LABS INC.,1703629
9,10,(OURCROWD INVESTMENT IN MST) L.P.,1599496


In [49]:
# Load the complete CSV from data_location into a DataFrame.
names =pd.read_csv(data_location)

In [50]:
# Sanity check: (rows, columns) of the loaded table.
print(names.shape)

(663000, 3)


In [51]:
def ngrams(string, n=3):
    """Normalise a name and split it into overlapping character n-grams.

    Normalisation steps, in order:
      1. drop every non-ASCII character and lower-case the text;
      2. delete the characters ``) ( . | [ ] { } '``;
      3. spell ``&`` out as ``and`` and turn ``,``, ``-`` and ``#`` into spaces;
      4. collapse runs of spaces, trim, then pad with one space on each side
         so that word starts/ends get their own n-grams;
      5. delete ``/`` and any whitespace followed by ``bd``;
      6. title-case the result.

    Parameters
    ----------
    string : str
        Text to tokenise.
    n : int, default 3
        Length of each n-gram.

    Returns
    -------
    list of str
        Every run of ``n`` consecutive characters of the normalised text, in
        order; empty when the normalised text is shorter than ``n``.
    """
    # An optional fix_text(...) clean-up pass used to run here; it is disabled.
    text = string.encode("ascii", errors="ignore").decode().lower()

    stripped_chars = ")(.|[]{}'"
    text = re.sub('[' + re.escape(stripped_chars) + ']', '', text)

    text = text.replace('&', 'and')
    for separator in (',', '-', '#'):
        text = text.replace(separator, ' ')

    # Single spaces only, then one space of padding at each end.
    text = ' ' + re.sub(' +', ' ', text).strip() + ' '

    # ',', '-' and '.' were already handled above, so in practice the character
    # class only removes '/'.
    # NOTE(review): `\sbd` also strips whitespace + "bd" at the start of a
    # longer word (e.g. " bdo" -> "o") — confirm that this is intended.
    text = re.sub(r'[,-./]|\sbd', r'', text)
    text = text.title()

    # Zipping n progressively shifted views of the text yields each window.
    shifted_views = (text[offset:] for offset in range(n))
    return [''.join(window) for window in zip(*shifted_views)]



In [52]:
# Column of raw company names; also used later to label the matched pairs.
company_names = names['Company Name']
# Character n-grams produced by `ngrams` are the TF-IDF terms (instead of the
# vectorizer's built-in tokenisation); min_df=1 keeps every n-gram seen at least once.
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
# One sparse row per company name, one column per distinct n-gram.
tf_idf_matrix = vectorizer.fit_transform(company_names)

In [53]:
def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Sparse product ``A * B`` computed by ``ct.sparse_dot_topn``.

    The output CSR buffers are allocated here with room for ``ntop`` entries
    per row of ``A``, filled in place by the compiled extension, and wrapped
    in a ``csr_matrix``.

    Parameters
    ----------
    A : sparse matrix, shape (M, K)
        Left operand; converted with ``tocsr()``.
    B : sparse matrix, shape (K, N)
        Right operand; converted with ``tocsr()``.
    ntop : int
        Passed to the extension and used to size the output buffers
        (``M * ntop`` entries). Presumably the maximum number of results kept
        per row — confirm against the sparse_dot_topn documentation.
    lower_bound : float, default 0
        Passed through to the extension; presumably results at or below this
        value are discarded — confirm against the sparse_dot_topn documentation.

    Returns
    -------
    scipy.sparse.csr_matrix, shape (M, N)

    Raises
    ------
    ValueError
        If the number of columns of ``A`` differs from the number of rows of
        ``B``.
    """
    # Force A and B to CSR; this is cheap when they already are CSR.
    A = A.tocsr()
    B = B.tocsr()
    M, inner_a = A.shape
    inner_b, N = B.shape

    # The extension receives raw index/pointer buffers, so a dimension
    # mismatch must be rejected here rather than left for compiled code to
    # index past the end of B's row pointers.
    if inner_a != inner_b:
        raise ValueError(
            "dimension mismatch: A has shape {} but B has shape {}".format(
                A.shape, B.shape))

    idx_dtype = np.int32

    # Upper bound on stored results: ntop per row of A.
    nnz_max = M * ntop

    # Output buffers, written in place by the extension.
    indptr = np.zeros(M + 1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data, indices, indptr), shape=(M, N))

In [None]:
# Multiply the TF-IDF matrix by its own transpose, i.e. score every name
# against every other name, with ntop=10 and lower_bound=0.8.
# The recorded run of this cell took about 2690 s, so the interval is measured
# with perf_counter(), which (unlike time.time()) cannot jump if the system
# clock is adjusted while the cell runs.
t1 = time.perf_counter()
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.8)
t = time.perf_counter() - t1
print("SELFTIMED:", t)

SELFTIMED: 2690.1000900268555


In [55]:
print(tf_idf_matrix.shape)

(663000, 33884)


In [56]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    """Turn the stored entries of a sparse match matrix into a DataFrame.

    Each non-zero entry ``(i, j, value)`` becomes one row holding
    ``name_vector[i]``, ``name_vector[j]`` and ``value``. Entries are listed
    in the matrix's storage order; no sorting is applied.

    Parameters
    ----------
    sparse_matrix : scipy sparse matrix
        Pairwise scores; row and column indices are positions in
        ``name_vector``.
    name_vector : sequence or pandas.Series of str
        Names, addressed by position (a Series' index labels are ignored).
    top : int or None, default 100
        Maximum number of entries to return. A falsy value (``None`` or
        ``0``) returns every non-zero entry. Values larger than the number of
        available entries are clamped.

    Returns
    -------
    pandas.DataFrame
        Columns ``left_side``, ``right_side`` and ``similairity`` (the column
        name is kept as spelled because callers select it by that name).

    Raises
    ------
    ValueError
        If ``top`` is negative.
    """
    if top and top < 0:
        raise ValueError("top must be non-negative, got {}".format(top))

    # Take rows, columns and values from the same COO view so they stay
    # aligned; explicitly stored zeros are dropped, as nonzero() would do.
    coo = sparse_matrix.tocoo()
    stored = coo.data != 0
    sparserows = coo.row[stored]
    sparsecols = coo.col[stored]
    scores = coo.data[stored]

    # Never ask for more rows than there are entries.
    nr_matches = min(top, scores.size) if top else scores.size

    # Positional lookup, independent of any pandas index on name_vector.
    names = np.asarray(name_vector, dtype=object)

    return pd.DataFrame({
        'left_side': names[sparserows[:nr_matches]],
        'right_side': names[sparsecols[:nr_matches]],
        'similairity': np.asarray(scores[:nr_matches], dtype=float),
    })

In [57]:
# Materialise the first 10,000 stored pairs, then keep only those scoring
# below 0.99999, which discards exact matches.
candidate_pairs = get_matches_df(matches, company_names, top=10000)
is_not_exact = candidate_pairs['similairity'] < 0.99999
matches_df = candidate_pairs[is_not_exact]
matches_df.head(5)


Unnamed: 0,left_side,right_side,similairity
6,& S MEDIA GROUP LLC,HH & S MEDIA GROUP LLC,0.834769
9,"'MKTG, INC.'",MKTG SERVICES INC,0.803911
12,(OURCROWD INVESTMENT IN MST) L.P.,OURCROWD (INVESTMENT IN MST - II) L.P.,0.951291
13,(OURCROWD INVESTMENT IN MST) L.P.,OURCROWD (INVESTMENT IN MST - III) LP,0.939652
170,1 800 MUTUALS ADVISOR SERIES,1 800 MUTUALS ADVISORS SERIES,0.947798


In [58]:
# How many pairs are left after dropping the exact matches.
matches_df.shape

(2595, 3)

In [59]:
# The 20 highest-scoring remaining pairs.
matches_df.sort_values(['similairity'], ascending=False).head(20)

Unnamed: 0,left_side,right_side,similairity
4698,60 EAST 42ND STREET ASSOCIATES,60 EAST 42ND STREET ASSOCIATES L.L.C.,0.986894
4696,60 EAST 42ND STREET ASSOCIATES L.L.C.,60 EAST 42ND STREET ASSOCIATES,0.986894
2651,250 WEST 57TH ST ASSOCIATES,250 WEST 57TH ST ASSOCIATES L.L.C.,0.986283
2649,250 WEST 57TH ST ASSOCIATES L.L.C.,250 WEST 57TH ST ASSOCIATES,0.986283
1235,1798 CONSUMER EQUITY LONG/SHORT FUND,1798 CONSUMER EQUITY LONG/SHORT FUND LP,0.984524
1233,1798 CONSUMER EQUITY LONG/SHORT FUND LP,1798 CONSUMER EQUITY LONG/SHORT FUND,0.984524
1831,2006 A-D DRILLING FUND XII JOINT VENTURE,2006 A-D DRILLING FUND XIII JOINT VENTURE,0.981445
1837,2006 A-D DRILLING FUND XIII JOINT VENTURE,2006 A-D DRILLING FUND XII JOINT VENTURE,0.981445
7556,AAPCF II SERIES OF HCP PRIVATE EQUITY INVESTOR...,AAPCF III SERIES OF HCP PRIVATE EQUITY INVESTO...,0.978794
7558,AAPCF III SERIES OF HCP PRIVATE EQUITY INVESTO...,AAPCF II SERIES OF HCP PRIVATE EQUITY INVESTOR...,0.978794


In [60]:
import pickle

In [61]:
# Persist the filtered matches to 'matched.pickle' in the working directory.
with open('matched.pickle', 'wb') as f:
    pickle.dump(matches_df, f)

In [67]:
# Read the pickle back in.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook
# wrote itself or that come from a trusted source.
with open('matched.pickle','rb') as f:
    loaded_obj = pickle.load(f)

In [68]:
# Print the reloaded object to confirm the round trip.
# NOTE(review): this prints the whole DataFrame; a .head() preview would keep the output short.
print ('loaded_obj is', loaded_obj)

loaded_obj is                                               left_side  \
6                                   & S MEDIA GROUP LLC   
9                                          'MKTG, INC.'   
12                    (OURCROWD INVESTMENT IN MST) L.P.   
13                    (OURCROWD INVESTMENT IN MST) L.P.   
170                        1 800 MUTUALS ADVISOR SERIES   
172                       1 800 MUTUALS ADVISORS SERIES   
181   1 FINANCIAL MARKETPLACE SECURITIES LLC        ...   
183             1 FINANCIAL MARKETPLACE SECURITIES, LLC   
186                                     1 JOINT VENTURE   
187                                     1 JOINT VENTURE   
188                                     1 JOINT VENTURE   
189                                     1 JOINT VENTURE   
190                                     1 JOINT VENTURE   
191                                     1 JOINT VENTURE   
192                                     1 JOINT VENTURE   
193                                     1 