### Fuzzy String Matcher (Basic)

In [None]:
import pandas as pd
import numpy as np
from string_grouper import match_strings, match_most_similar, \
	group_similar_strings, compute_pairwise_similarities, \
	StringGrouper
import cleanco

In [None]:
## directory info
## NOTE: this value is prepended verbatim to the CSV file name in the file-read
## cell (no separator is inserted), so it has to end with a path separator.
rootDataPath = 'INSERT YOUR FILEPATH HERE'

In [None]:
## load the raw company-match table from the data directory
matchlist = pd.read_csv('{}company_matches_pre_cleaning.csv'.format(rootDataPath))

In [None]:
## establish reference list for all matches to be matched against
ref_list = matchlist[['companyId', 'companyName']].drop_duplicates().copy()

## words/characters to strip from the company names before matching.
## Every entry needs its own trailing comma: a missing comma after 'com' used
## to fuse it with 'networks' into the single (never matching) term 'comnetworks'.
## NOTE(review): terms are matched as case-sensitive substrings, so 'com' also
## hits names such as 'Telecom' -- confirm that this is intended.
noise_terms = [
    '-', '_', 'technologies', 'technology', 'pharmaceuticals', 'partners',
    'international', 'financial services', 'management', 'consulting',
    'industries', 'entertainment', 'enterprises', 'com', 'networks',
    'life sciences', 'capital markets', 'communications', 'resources',
    'healthcare', 'therapeutics', 'health care',
]

## a regex alternation tries its branches left to right, so longer terms go
## first; otherwise 'com' would win over 'communications' and leave 'munications'
pattern = '|'.join(sorted(noise_terms, key=len, reverse=True))

## replace those words (regex=True must be explicit: since pandas 2.0 the
## default is a literal match, which would look for the whole pattern string)
ref_list['dehyphenName'] = ref_list.companyName.str.replace(pattern, ' ', regex=True)

## apply company name cleaning
ref_list['cleanMain'] = ref_list.dehyphenName.apply(cleanco.basename)

In [None]:
## establish list of companies to be matched (the reference list is where we'll be matching them)
to_match_list = matchlist[['companyName2']].drop_duplicates().copy().astype(str)

## `pattern` is a regex alternation, so regex=True must be explicit: since
## pandas 2.0 the default is a literal match of the whole pattern string
to_match_list['dehyphenName'] = to_match_list.companyName2.str.replace(pattern, ' ', regex=True)

## apply the same company name cleaning that is used for the reference list
to_match_list['cleanToMatch'] = to_match_list.dehyphenName.apply(cleanco.basename)

In [None]:
## break apart the match list into blocks of at most 600k rows to speed things up
chunk_size = 600000
clean_names = to_match_list.cleanToMatch.astype(str)

## step through the series positionally so that every chunk (including the
## last one) is bounded by chunk_size and small inputs do not produce empty
## trailing chunks; max(..., 1) keeps a single chunk for an empty input so
## that matchlists_chunks[0] always exists
matchlists_chunks = [
    clean_names.iloc[start:start + chunk_size].copy()
    for start in range(0, max(len(clean_names), 1), chunk_size)
]

In [None]:
## trial run on the first chunk only, to check the similarity threshold.
## going with 0.67 for now.
matches1 = match_strings(
    ref_list.cleanMain,
    matchlists_chunks[0],
    min_similarity=0.67,
    n_blocks='auto',
)
## display the 20 lowest-similarity pairs that still passed the threshold
matches1.sort_values('similarity').head(20)

In [None]:
## match every chunk against the reference list, then stack the per-chunk
## results into one frame with a fresh 0..n-1 index
chunk_results = [
    match_strings(ref_list.cleanMain, chunk, n_blocks='auto', min_similarity=0.67)
    for chunk in matchlists_chunks
]
matches = pd.concat(chunk_results, axis=0, ignore_index=True)
## display the combined matches, strongest first
matches.sort_values('similarity', ascending=False)

In [63]:
def retrieveData(user_agent=None, timeout=30):
    """Download the SEC company-ticker listing and save it as ``company_tickers.json``.

    Args:
        user_agent: Optional value for the ``User-Agent`` request header.
            SEC's fair-access guidance asks automated clients to declare one;
            when omitted, the default header of ``requests`` is sent.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never parsed or written to disk as data.
    """
    url = "https://www.sec.gov/files/company_tickers.json"
    headers = {"User-Agent": user_agent} if user_agent else None
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to decode the error body.
    response.raise_for_status()
    json_data = response.json()
    file_name = "company_tickers.json"
    with open(file_name, "w") as json_file:
        json.dump(json_data, json_file)
    print(f"JSON data saved to {file_name}")

def loadData():
    """Build the company/ticker table from ``company_tickers.json``.

    Reads the JSON file from the working directory, turns each top-level
    entry into a row, title-cases the ``title`` column and renames it to
    ``Company``, drops rows whose ticker contains a hyphen, keeps the first
    row per company name, writes the result to ``SEC-CompanyTicker.csv``
    and returns it.
    """
    with open("company_tickers.json", "r") as handle:
        raw_entries = json.load(handle)

    # The JSON maps a row key to a record, so the frame has to be flipped
    # to get one row per record.
    table = pd.DataFrame(raw_entries).T
    table["title"] = table["title"].str.title()
    table = table.rename(columns={"title": "Company"})

    has_hyphen = table["ticker"].str.contains('-')
    table = table[~has_hyphen].drop_duplicates(subset='Company', keep='first')

    table.to_csv("SEC-CompanyTicker.csv")
    return table
## build the ticker table when this cell runs; loadData() reads
## company_tickers.json from the working directory, so that file must already exist
df = loadData()


In [70]:
from sentence_transformers import SentenceTransformer
import faiss
import requests
import json
import pandas as pd
import numpy as np
## names that are later passed to matcher.getCompanyMatch
## NOTE(review): the frame read from SEC-Data.csv is discarded immediately,
## because the next line rebinds companyNames to a two-row sample -- confirm
## which of the two inputs is intended.
companyNames = pd.read_csv("SEC-Data.csv")
companyNames = pd.DataFrame({"Company":["Wells Fargo","Sculptor"]})


class Similarity:
    """Nearest-neighbour company-name matcher built on sentence embeddings and FAISS.

    The known company names are read from a CSV with a ``Company`` column.
    ``TrainFaiss`` embeds them and stores the unit-normalised vectors in a
    flat L2 index; ``predict`` embeds a query the same way and returns the
    name stored at the closest vector.
    """

    def __init__(self, ticker_csv="SEC-CompanyTicker.csv"):
        """Load the table of known companies.

        Args:
            ticker_csv: Path of the CSV to read; its first column is used as
                the index and it must contain a ``Company`` column.
        """
        self.possibleCompanies = pd.read_csv(ticker_csv, index_col=0)
        # Both are populated by TrainFaiss().
        self.encoder = None
        self.index = None

    def TrainFaiss(self):
        """Embed every known company name and build the FAISS index."""
        self.encoder = SentenceTransformer("paraphrase-mpnet-base-v2")
        vectors = self.encoder.encode(list(self.possibleCompanies.Company))
        # FAISS works in place on contiguous float32 arrays.
        vectors = np.ascontiguousarray(vectors, dtype=np.float32)
        vector_dimension = vectors.shape[1]
        self.index = faiss.IndexFlatL2(vector_dimension)
        # On unit-length vectors, L2 distance ranks neighbours exactly like
        # cosine similarity.
        faiss.normalize_L2(vectors)
        self.index.add(vectors)

    def getCompanyMatch(self, companyNames):
        """Attach the best match and its company details to each input name.

        Args:
            companyNames: DataFrame with a ``Company`` column of names to look
                up. It is not modified.

        Returns:
            A new DataFrame: the input plus a ``Matches`` column, merged with
            the known-company table on ``Matches`` == ``Company`` (the two
            ``Company`` columns come back as ``Company_x`` / ``Company_y``).
        """
        matches = [self.predict(company) for company in companyNames["Company"]]
        # assign() returns a new frame, leaving the caller's frame untouched.
        matched = companyNames.assign(Matches=matches)
        return pd.merge(matched, self.possibleCompanies, left_on="Matches", right_on="Company")

    def predict(self, search_text):
        """Return the known company name closest to ``search_text``.

        Raises:
            RuntimeError: If ``TrainFaiss`` has not been called yet.
        """
        if getattr(self, "index", None) is None or getattr(self, "encoder", None) is None:
            raise RuntimeError("call TrainFaiss() before predict()")
        search_vector = self.encoder.encode(search_text)
        _vector = np.ascontiguousarray([search_vector], dtype=np.float32)
        faiss.normalize_L2(_vector)
        # Only the single nearest neighbour is used, so there is no need to
        # rank the whole index.
        _, ann = self.index.search(_vector, k=1)
        # The index stores vectors in row order, so the hit is a row position;
        # the frame's own index labels need not be contiguous.
        return self.possibleCompanies.Company.iloc[ann[0][0]]
    
    

In [71]:

## create the matcher; the constructor reads SEC-CompanyTicker.csv from the working directory
matcher = Similarity()


In [None]:
## load the sentence-transformer model and build the FAISS index over the known company names
matcher.TrainFaiss()


In [59]:
## run the matcher over the names in companyNames; the returned frame is shown as the cell output
matcher.getCompanyMatch(companyNames)

Unnamed: 0,Company_x,Matches,cik_str,ticker,Company_y
0,Wells Fargo,Wells Fargo & Company/Mn,72971,WFC,Wells Fargo & Company/Mn
1,Wells Fargo,Wells Fargo & Company/Mn,72971,WFC-PQ,Wells Fargo & Company/Mn
2,Wells Fargo,Wells Fargo & Company/Mn,72971,WFC-PY,Wells Fargo & Company/Mn
3,Wells Fargo,Wells Fargo & Company/Mn,72971,WFC-PR,Wells Fargo & Company/Mn
4,Wells Fargo,Wells Fargo & Company/Mn,72971,WFC-PL,Wells Fargo & Company/Mn
5,Wells Fargo,Wells Fargo & Company/Mn,72971,WFC-PC,Wells Fargo & Company/Mn
6,Wells Fargo,Wells Fargo & Company/Mn,72971,WFC-PD,Wells Fargo & Company/Mn
7,Wells Fargo,Wells Fargo & Company/Mn,72971,WFCNP,Wells Fargo & Company/Mn
8,Wells Fargo,Wells Fargo & Company/Mn,72971,WFC-PA,Wells Fargo & Company/Mn
9,Wells Fargo,Wells Fargo & Company/Mn,72971,WFC-PZ,Wells Fargo & Company/Mn


In [50]:
## match every name in SEC-Data.csv (first CSV column used as the index)
## NOTE: this rebinds `df` (previously the result of loadData()) and `matches`
## (previously the string_grouper results), so those earlier values are lost.
df = pd.read_csv("SEC-Data.csv",index_col=0)

matches = matcher.getCompanyMatch(df)

In [49]:
## NOTE(review): the Similarity class defined in this file has no loadData
## method (loadData is a module-level function), so this call raises
## AttributeError against the current class -- presumably a leftover from an
## earlier version of the class; confirm and update or remove.
matcher.loadData()

Unnamed: 0,cik_str,ticker,Companies
0,320193,AAPL,Apple Inc.
1,789019,MSFT,Microsoft Corp
2,1652044,GOOGL,Alphabet Inc.
3,1018724,AMZN,Amazon Com Inc
4,1045810,NVDA,Nvidia Corp
...,...,...,...
10893,1945711,LVROW,Lavoro Ltd
10894,1898795,LVWR-WT,"Livewire Group, Inc."
10895,1837344,MBTCR,Nocturne Acquisition Corp
10896,1837344,MBTCU,Nocturne Acquisition Corp


Unnamed: 0,Company,Matches
0,3Com Corp,3M Co
1,3M Company,3M Co
2,A.G. Edwards Inc.,Edwards Lifesciences Corp
3,Abbott Laboratories,Abbott Laboratories
4,Abercrombie & Fitch Co.,Abercrombie & Fitch Co /De/
5,ABM Industries Incorporated,Abm Industries Inc /De/
6,Ace Hardware Corporation,Ace Global Business Acquisition Ltd
7,ACT Manufacturing Inc.,"Actinium Pharmaceuticals, Inc."
8,Acterna Corp.,"Actinium Pharmaceuticals, Inc."
9,"Adams Resources & Energy, Inc.","Adams Resources & Energy, Inc."
