### Fuzzy String Matcher (Basic)

In [None]:
import pandas as pd
import numpy as np
from string_grouper import match_strings, match_most_similar, \
	group_similar_strings, compute_pairwise_similarities, \
	StringGrouper
import cleanco

In [None]:
## directory info
rootDataPath = 'INSERT YOUR FILEPATH HERE'

In [None]:
## file read
matchlist = pd.read_csv(f'{rootDataPath}company_matches_pre_cleaning.csv')

In [None]:
## establish reference list for all matches to be matched against
ref_list = matchlist[['companyId','companyName']].drop_duplicates().copy()

## remove the following words from the company names
pattern = '|'.join(['-','_','technologies','technology','pharmaceuticals','partners', 'international',
'financial services','management','consulting','industries','entertainment', 'enterprises','com'
'networks','life sciences','capital markets','communications','resources','healthcare','therapeutics','health care'])

## replace those words
ref_list['dehyphenName'] = ref_list.companyName.str.replace(pattern,' ')

## apply company name cleaning
ref_list['cleanMain'] = ref_list.apply(lambda x: cleanco.basename(x.dehyphenName), axis=1)

In [None]:
## establish list of companies to be matched (the reference list is where we'll be matching them)
to_match_list = matchlist[['companyName2']].drop_duplicates().copy().astype(str)
to_match_list['dehyphenName'] = to_match_list.companyName2.str.replace(pattern, ' ')
to_match_list['cleanToMatch'] = to_match_list.apply(lambda x: cleanco.basename(x.dehyphenName), axis=1)

In [None]:
## break apart the match list to speed things up into blocks of 600k
matchlists_chunks = [to_match_list.cleanToMatch[0:600000].copy(), to_match_list.cleanToMatch[600000:1200000].copy(), to_match_list.cleanToMatch[1200000:].copy()]
matchlists_chunks = [item.astype(str) for item in matchlists_chunks]

In [None]:
## test to check similarity threshold. going with 0.67 for now.
matches1 = match_strings(ref_list.cleanMain, matchlists_chunks[0], n_blocks='auto', min_similarity=0.67)
matches1.sort_values(by='similarity', ascending=True).head(20)

In [None]:
## concatenate all of the matches
matches = pd.concat([match_strings(ref_list.cleanMain, item, n_blocks='auto', min_similarity=0.67) for item in matchlists_chunks], axis=0, ignore_index=True)
matches.sort_values(by='similarity', ascending=False)

In [90]:
def retrieveData():
    url = "https://www.sec.gov/files/company_tickers.json"
    response = requests.get(url)
    json_data = response.json()
    file_name = "company_tickers.json"
    with open(file_name, "w") as json_file:
        json.dump(json_data, json_file)
    print(f"JSON data saved to {file_name}")

def loadData():
    file_name = "company_tickers.json"
    with open(file_name, "r") as json_file:
        loaded_json_data = json.load(json_file)
    df = pd.DataFrame(loaded_json_data)
    df = df.transpose()
    df.title = df.title.str.title()
    df = df.rename({"title":"Company"},axis=1)
    df = df[~df.ticker.str.contains('-')]
    df = df.drop_duplicates(subset='Company', keep='first')
    df.to_csv("SEC-CompanyTicker.csv")
    return df
df = loadData()


In [94]:
df[df.Company.str.contains("Sculptor")]

Unnamed: 0,cik_str,ticker,Company
3682,1403256,SCU,"Sculptor Capital Management, Inc."


In [86]:
from sentence_transformers import SentenceTransformer
import faiss
import requests
import json
import pandas as pd
import numpy as np
companyNames = pd.read_csv("SEC-Data.csv")
companyNames = pd.DataFrame({"Company":["Wells Fargo","Sculptor"]}) 


class Similarity:  
    def __init__(self):
        self.possibleCompanies = pd.read_csv("SEC-CompanyTicker.csv",index_col=0)

        
    def TrainFaiss(self):
        self.encoder = SentenceTransformer("paraphrase-mpnet-base-v2")
        vectors = self.encoder.encode(list(self.possibleCompanies.Company))
        vector_dimension = vectors.shape[1]
        self.index = faiss.IndexFlatL2(vector_dimension)
        faiss.normalize_L2(vectors)
        self.index.add(vectors)
        
    def getCompanyMatch(self, companyNames):
        matches = []
        for company in companyNames["Company"]:
            match = self.predict(company)
            matches.append(match)
        companyNames["Matches"] = matches
        companyNames = pd.merge(companyNames, self.possibleCompanies, left_on="Matches", right_on="Company")
        return companyNames
            
    def predict(self,search_text):
        search_vector = self.encoder.encode(search_text)
        _vector = np.array([search_vector])
        faiss.normalize_L2(_vector)
        k = self.index.ntotal
        distances, ann = self.index.search(_vector, k=k)
        return self.possibleCompanies.Company[ann[0][0]]
    
    

In [87]:

matcher = Similarity()


In [88]:
matcher.TrainFaiss()


In [95]:
matcher.getCompanyMatch(companyNames)

Unnamed: 0,Company_x,Matches,cik_str,ticker,Company_y
0,Wells Fargo,Totalenergies Se,879764,TTE,Totalenergies Se
1,Sculptor,Southport Acquisition Corp,1865200,PORT,Southport Acquisition Corp


In [96]:
matcher.predict("Sculptor")

'Southport Acquisition Corp'

In [50]:
df = pd.read_csv("SEC-Data.csv",index_col=0)

matches = matcher.getCompanyMatch(df)

In [49]:
matcher.loadData()

Unnamed: 0,cik_str,ticker,Companies
0,320193,AAPL,Apple Inc.
1,789019,MSFT,Microsoft Corp
2,1652044,GOOGL,Alphabet Inc.
3,1018724,AMZN,Amazon Com Inc
4,1045810,NVDA,Nvidia Corp
...,...,...,...
10893,1945711,LVROW,Lavoro Ltd
10894,1898795,LVWR-WT,"Livewire Group, Inc."
10895,1837344,MBTCR,Nocturne Acquisition Corp
10896,1837344,MBTCU,Nocturne Acquisition Corp


In [98]:

possibleCompanies = pd.read_csv("SEC-CompanyTicker.csv",index_col=0)

encoder = SentenceTransformer("paraphrase-mpnet-base-v2")
vectors = encoder.encode(list(possibleCompanies.Company))
vector_dimension = vectors.shape[1]
index = faiss.IndexFlatL2(vector_dimension)
faiss.normalize_L2(vectors)
index.add(vectors)


In [102]:
 search_vector = encoder.encode("Sculptor")
_vector = np.array([search_vector])
faiss.normalize_L2(_vector)
k = index.ntotal
distances, ann = index.search(_vector, k=k)
distances

array([[0.7488366, 1.1460173, 1.1958046, ..., 2.1808012, 2.196936 ,
        2.221943 ]], dtype=float32)

In [114]:
possibleCompanies.iloc[[ann[0][2]]]

Unnamed: 0,cik_str,ticker,Company
7064,1617669,UFAB,"Unique Fabricating, Inc."


In [111]:
k

8326

In [109]:
ann[0][1]

8252

In [112]:
possibleCompanies

Unnamed: 0,cik_str,ticker,Company
0,320193,AAPL,Apple Inc.
1,789019,MSFT,Microsoft Corp
2,1652044,GOOGL,Alphabet Inc.
3,1018724,AMZN,Amazon Com Inc
4,1045810,NVDA,Nvidia Corp
...,...,...,...
10719,1843862,EOCWF,Elliott Opportunity Ii Corp.
10721,1698538,STSR,"Strategic Student & Senior Housing Trust, Inc."
10722,874710,AHPIQ,Allied Healthcare Products Inc
10758,13372,NSARO,Nstar Electric Co
