In [19]:
import pickle

In [1]:
import pandas as pd

# Do not truncate long company names when DataFrames are displayed.
# `None` is the documented "no limit" value; the former `-1` sentinel is
# deprecated and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

# Load the company list from a CSV expected in the working directory.
names = pd.read_csv('sec__edgar_company_info.csv')
print('The shape: %d x %d' % names.shape)
names.head()

The shape: 663000 x 3


Unnamed: 0,Line Number,Company Name,Company CIK Key
0,1,!J INC,1438823
1,2,"#1 A LIFESAFER HOLDINGS, INC.",1509607
2,3,#1 ARIZONA DISCOUNT PROPERTIES LLC,1457512
3,4,#1 PAINTBALL CORP,1433777
4,5,$ LLC,1427189


In [2]:
from ftfy import fix_text
import re

In [3]:
def ngrams(string, n=3):
    """Normalise a name and split it into overlapping character n-grams.

    The input is repaired with ``fix_text``, reduced to ASCII, lower-cased,
    stripped of bracket/quote/period characters, has ``&`` spelled out as
    ``and`` and commas/hyphens turned into spaces, is title-cased, has runs
    of spaces collapsed, and is finally padded with one space on each side
    so that leading and trailing characters also appear in n-grams.

    Parameters
    ----------
    string : str
        The raw name to tokenise.
    n : int, default 3
        Length of each n-gram.

    Returns
    -------
    list of str
        Every run of ``n`` consecutive characters of the normalised, padded
        name, in order. Empty when the padded name is shorter than ``n``.
    """
    text = fix_text(string)  # repair mojibake / broken unicode
    text = text.encode("ascii", errors="ignore").decode()  # drop non-ASCII chars
    text = text.lower()

    chars_to_remove = [")", "(", ".", "|", "[", "]", "{", "}", "'"]
    rx = '[' + re.escape(''.join(chars_to_remove)) + ']'
    text = re.sub(rx, '', text)

    text = text.replace('&', 'and')
    text = text.replace(',', ' ')
    text = text.replace('-', ' ')
    text = text.title()  # normalise case - capital at start of each word
    text = re.sub(' +', ' ', text).strip()  # collapse runs of spaces
    text = ' ' + text + ' '  # pad so word boundaries show up in the n-grams

    # A single pass is enough: ',', '-' and '.' were already handled above, so
    # in practice this only strips '/'. The original ran this substitution
    # twice; the second run could never change anything.
    # NOTE(review): the `\sbd` alternative looks unreachable here, because
    # title-casing capitalises any letter that follows whitespace - confirm
    # whether a " Bd" suffix was meant to be stripped.
    text = re.sub(r'[,-./]|\sbd', r'', text)

    # Zip n shifted copies of the text to get every window of n characters.
    shifted = [text[i:] for i in range(n)]
    return [''.join(gram) for gram in zip(*shifted)]

# Demonstrate the analyzer on a sample name.
# NOTE(review): the label reads "$ LLC BD" but the string actually passed in
# is '$ LLC.BD' (period, no space) - confirm which one was intended.
print('All 3-grams in "$ LLC BD":')
print(ngrams('$ LLC.BD'))

All 3-grams in "$ LLC BD":
[' $ ', '$ L', ' Ll', 'Llc', 'lcb', 'cbd', 'bd ']


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Corpus: the raw 'Company Name' column of the loaded CSV.
company_names = names['Company Name']
# Passing a callable as `analyzer` makes the vectorizer use the list returned
# by ngrams() as the features of each name.
vectorizer = TfidfVectorizer(min_df=1, analyzer=ngrams)
# Learn the n-gram vocabulary and build the TF-IDF matrix
# (one row per company name, one column per distinct n-gram).
tf_idf_matrix = vectorizer.fit_transform(company_names)

In [5]:
# Spot-check the analyzer on the first company name in the file.
ngrams('!J INC')

[' !J', '!J ', 'J I', ' In', 'Inc', 'nc ']

In [6]:
import numpy as np
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Multiply two sparse matrices through ``ct.sparse_dot_topn``.

    Both operands are converted to CSR, output buffers sized for ``ntop``
    entries per row of ``A`` are allocated, and the compiled routine fills
    them in.

    Parameters
    ----------
    A, B : scipy sparse matrices
        Left and right operands; ``A`` is (M, K) and ``B`` is (K, N).
    ntop : int
        Number of result slots reserved per row of ``A``.
    lower_bound : number, default 0
        Passed straight through to ``ct.sparse_dot_topn``.
        NOTE(review): presumably results at or below this score are
        dropped - confirm against the sparse_dot_topn documentation.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape (M, N) built from the buffers the routine filled.
    """
    # tocsr() is a no-op conversion when the operand is already CSR.
    left = A.tocsr()
    right = B.tocsr()

    n_rows, _ = left.shape
    _, n_cols = right.shape

    index_type = np.int32

    # Room for at most `ntop` stored entries in every output row.
    capacity = n_rows * ntop
    out_indptr = np.zeros(n_rows + 1, dtype=index_type)
    out_indices = np.zeros(capacity, dtype=index_type)
    out_data = np.zeros(capacity, dtype=left.dtype)

    # The routine writes its result into the three output buffers in place.
    ct.sparse_dot_topn(
        n_rows,
        n_cols,
        np.asarray(left.indptr, dtype=index_type),
        np.asarray(left.indices, dtype=index_type),
        left.data,
        np.asarray(right.indptr, dtype=index_type),
        np.asarray(right.indices, dtype=index_type),
        right.data,
        ntop,
        lower_bound,
        out_indptr,
        out_indices,
        out_data,
    )

    return csr_matrix((out_data, out_indices, out_indptr), shape=(n_rows, n_cols))

In [7]:
import time
# Match every company name against every other one: at most 10 result slots
# per name, with 0.8 passed as the similarity lower bound.
# perf_counter() is a monotonic clock, so the measured duration cannot be
# skewed by system clock adjustments during this long-running call.
t1 = time.perf_counter()
matches = awesome_cossim_top(tf_idf_matrix, tf_idf_matrix.transpose(), 10, 0.8)
t = time.perf_counter() - t1
print("SELFTIMED:", t)

SELFTIMED: 4370.280823707581


In [8]:
# (number of company names, number of distinct n-grams)
print(tf_idf_matrix.shape)

(663000, 37119)


In [9]:
# Transposed TF-IDF matrix, kept only to inspect its shape.
dd=tf_idf_matrix.transpose()

In [10]:
# The transpose swaps the two dimensions of tf_idf_matrix.
print(dd.shape)

(37119, 663000)


In [11]:
# The match matrix is square: one row and one column per company name.
print(matches.shape)


(663000, 663000)


In [29]:
#print(company_names)

In [12]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    """Turn a sparse similarity matrix into a table of matched name pairs.

    Parameters
    ----------
    sparse_matrix : scipy sparse matrix
        Similarity scores; a non-zero entry at (i, j) pairs the i-th and
        j-th element of ``name_vector``.
    name_vector : sequence of str (list, array or pandas Series)
        Names, addressed by position.
    top : int or None, default 100
        Maximum number of pairs to return, taken in the matrix's storage
        order. A falsy value (``None`` or ``0``) returns every pair. Values
        larger than the number of available pairs are clamped.

    Returns
    -------
    pandas.DataFrame
        Columns ``left_side``, ``right_side`` and ``similairity`` (the
        misspelling is kept because callers select the column by that name).

    Raises
    ------
    ValueError
        If ``top`` is negative.
    """
    if top and top < 0:
        raise ValueError("negative dimensions are not allowed")

    # Take coordinates and values from one COO view with one mask, so each
    # score is guaranteed to belong to its (row, col) pair. Indexing `.data`
    # separately goes out of step as soon as the matrix stores explicit zeros.
    coo = sparse_matrix.tocoo()
    stored = coo.data != 0
    sparserows = coo.row[stored]
    sparsecols = coo.col[stored]
    scores = coo.data[stored]

    # Clamp so that asking for more pairs than exist no longer raises
    # IndexError.
    if top:
        nr_matches = min(top, sparsecols.size)
    else:
        nr_matches = sparsecols.size

    # Positional lookup: matrix row/column numbers are positions, not labels.
    all_names = np.asarray(name_vector, dtype=object)
    left_side = all_names[sparserows[:nr_matches]]
    right_side = all_names[sparsecols[:nr_matches]]
    similairity = scores[:nr_matches].astype(float)

    return pd.DataFrame({'left_side': left_side,
                         'right_side': right_side,
                         'similairity': similairity})

In [13]:
# Tabulate only the first 10000 stored pairs of the match matrix.
matches_df = get_matches_df(matches, company_names,top=10000 )
# Drop pairs scoring 0.99999 or higher, i.e. (near-)identical names.
matches_df = matches_df[matches_df['similairity'] < 0.99999] # Remove all exact matches
matches_df.head(5)


Unnamed: 0,left_side,right_side,similairity
6,& S MEDIA GROUP LLC,HH & S MEDIA GROUP LLC,0.834766
9,"'MKTG, INC.'",MKTG SERVICES INC,0.803908
12,(OURCROWD INVESTMENT IN MST) L.P.,OURCROWD (INVESTMENT IN MST - II) L.P.,0.95128
13,(OURCROWD INVESTMENT IN MST) L.P.,OURCROWD (INVESTMENT IN MST - III) LP,0.939638
170,1 800 MUTUALS ADVISOR SERIES,1 800 MUTUALS ADVISORS SERIES,0.947853


In [14]:
# Number of pairs left after the similarity filter.
matches_df.shape

(2605, 3)

In [15]:
# Highest-scoring pairs first.
matches_df.sort_values(['similairity'], ascending=False).head(20)

Unnamed: 0,left_side,right_side,similairity
4728,60 EAST 42ND STREET ASSOCIATES,60 EAST 42ND STREET ASSOCIATES L.L.C.,0.9869
4726,60 EAST 42ND STREET ASSOCIATES L.L.C.,60 EAST 42ND STREET ASSOCIATES,0.9869
2682,250 WEST 57TH ST ASSOCIATES L.L.C.,250 WEST 57TH ST ASSOCIATES,0.986289
2684,250 WEST 57TH ST ASSOCIATES,250 WEST 57TH ST ASSOCIATES L.L.C.,0.986289
1266,1798 CONSUMER EQUITY LONG/SHORT FUND LP,1798 CONSUMER EQUITY LONG/SHORT FUND,0.985306
1268,1798 CONSUMER EQUITY LONG/SHORT FUND,1798 CONSUMER EQUITY LONG/SHORT FUND LP,0.985306
1870,2006 A-D DRILLING FUND XIII JOINT VENTURE,2006 A-D DRILLING FUND XII JOINT VENTURE,0.981423
1864,2006 A-D DRILLING FUND XII JOINT VENTURE,2006 A-D DRILLING FUND XIII JOINT VENTURE,0.981423
7585,"AAPCF III SERIES OF HCP PRIVATE EQUITY INVESTORS, LLC","AAPCF II SERIES OF HCP PRIVATE EQUITY INVESTORS, LLC",0.978797
7583,"AAPCF II SERIES OF HCP PRIVATE EQUITY INVESTORS, LLC","AAPCF III SERIES OF HCP PRIVATE EQUITY INVESTORS, LLC",0.978797


In [16]:
# Lowest-scoring pairs that were still kept (the end of the descending sort).
matches_df.sort_values(['similairity'], ascending=False).tail(20)

Unnamed: 0,left_side,right_side,similairity
7448,AAG HOLDING CO INC,"AAG HOLDING I, LLC",0.800815
7450,"AAG HOLDING I, LLC",AAG HOLDING CO INC,0.800815
1158,1740 TRUST #40-AD,1740 TRUST #40-LOCUST,0.800784
1207,1740 TRUST #40-LOCUST,1740 TRUST #40-AD,0.800784
2392,"21ST CENTURY ONCOLOGY OF ALABAMA, LLC","21ST CENTURY ONCOLOGY, LLC",0.80077
2412,"21ST CENTURY ONCOLOGY, LLC","21ST CENTURY ONCOLOGY OF ALABAMA, LLC",0.80077
8894,ABILITY INC.,ABILITY COM INC,0.80075
8891,ABILITY COM INC,ABILITY INC.,0.80075
9092,ABN AMRO CAPITAL FUNDING LLC VII,ABN AMRO CAPITAL FUNDING TRUST I,0.800743
9109,ABN AMRO CAPITAL FUNDING TRUST I,ABN AMRO CAPITAL FUNDING LLC VII,0.800743


In [17]:
# sort_values() returns a new frame, so matches_df itself is unchanged.
matches_df.shape

(2605, 3)

In [20]:
# Persist the filtered matches to 'matched.pickle' in the working directory.
# The file is opened in binary mode because pickle writes bytes.
with open('matched.pickle', 'wb') as f:
    pickle.dump(matches_df, f)