In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from IPython.core.display import display, HTML
#set display options for pandas dataframes
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
display(HTML("<style>.container { width:100% !important; }</style>"))
local_path = "/Users/roellk/HMDA/gleif/gleif_parsing/"

In [3]:
#load GLEIF LEI list
gleif_df = pd.read_csv(local_path+"data/lei_info.txt", sep="|", dtype="object")
fill_cols = ["legal_city", "legal_name", "hq_city", "hq_country"]
gleif_df.fillna("-1", inplace=True)
gleif_df = gleif_df[gleif_df.legal_country=="US"] #filter for only US companies to speed matching
#split legal region to capture only 2 digit state code EG: MT
gleif_df['state'] = gleif_df.legal_region.apply(lambda x: x[-2:] if x != "-1" else "")
gleif_df[gleif_df.state!=""].state.head() #check output

10876    DE
15420    NY
15426    KY
15440    NY
15441    CT
Name: state, dtype: object

In [25]:
#load NIC IDs (this is panel with addresses pulled from NIC)
ts_data = pd.read_csv(local_path+"data/ts_addresses.csv", sep=",", dtype="object")
ts_data.columns = ["institution_id", "agency", "rid", "name", "state", "city", "ts_address", "ts_city", "ts_state"]
ts_data.fillna("-1", inplace=True)
print(len(ts_data))
ts_data.dtypes


448


institution_id    object
agency            object
rid               object
name              object
state             object
city              object
ts_address        object
ts_city           object
ts_state          object
dtype: object

In [30]:
# create column with name, address, city, state to match to gleif
ts_data['match_string'] = ts_data.apply(lambda x: str(x.name) + " " + str(x.ts_city) + " " + str(x.ts_state) + " " + str(x.ts_address), axis=1)
ts_data.head()

Unnamed: 0,institution_id,agency,rid,name,state,city,ts_address,ts_city,ts_state,match_string
0,3875151,7,770575554,SANTA CRUZ HOME FINANCE,CA,SANTA CRUZ,1535 SEABRIGHT AVENUE,SANTA CRUZ,CA,0 SANTA CRUZ CA 1535 SEABRIGHT AVENUE
1,4186658,7,330106473,ARROWHEAD CAPITAL CORPORATION,CA,REDLANDS,1425 W LUGONIA AVE STE 102,REDLANDS,CA,1 REDLANDS CA 1425 W LUGONIA AVE STE 102
2,4437510,7,352195549,"HAMILTON GROUP FUNDING, INC.",FL,COOPER CITY,"1551 SAWGRASS CORPORATE PARKWAY, STE 300",SUNRISE,FL,"2 SUNRISE FL 1551 SAWGRASS CORPORATE PARKWAY, ..."
3,3878460,7,911395192,"NETWORK MORTGAGE SERVICES, INC.",WA,LYNNWOOD,"19101 36TH AVE W, SUITE 103",LYNNWOOD,WA,"3 LYNNWOOD WA 19101 36TH AVE W, SUITE 103"
4,3890097,5,880500878,GREATER NEVADA MORTGAGE SERVICE,NV,CARSON CITY,4070 SILVER SAGE DRIVE,CARSON CITY,NV,4 CARSON CITY NV 4070 SILVER SAGE DRIVE


In [31]:
#gleif: legal_name, legal_city, legal_region[-2], legal_street1
gleif_df['match_string_gleif'] = gleif_df.apply(lambda x: x.legal_name + " " + x.legal_city + " " +
                                               x.state + " " + x.legal_street1, axis=1)



In [33]:
count = 0
ts_data["match_tup"] = ""
for index, row in ts_data[ts_data.match_string!="-1 -1 -1 -1"].iterrows():
    print(index, row.match_string)
    #match_output = row.apply(lambda x: process.extractOne(x.match_string, gleif_df.match_string_gleif))
    match_output = process.extractOne(ts_data.at[index, "match_string"], gleif_df.match_string_gleif)
    #print(ts_data.at[index, "match_string"])
    ts_data.at[index, "match_tup"] = match_output
    
    if count % 10 == 0 and count >=10:
        ts_data.to_csv("../output/ts_results_"+str(count)+".txt", sep="|", index=False)
        print("saving {count}".format(count=count))

0 0 SANTA CRUZ CA 1535 SEABRIGHT AVENUE
1 1 REDLANDS CA 1425 W LUGONIA AVE STE 102
2 2 SUNRISE FL 1551 SAWGRASS CORPORATE PARKWAY, STE 300
3 3 LYNNWOOD WA 19101 36TH AVE W, SUITE 103
4 4 CARSON CITY NV 4070 SILVER SAGE DRIVE
5 5 Redding CA 2285 Hilltop Drive, Suite 100
6 6 Ladera Ranch CA 111 Corporate Drive, Suite 270
7 7 MINOT ND 615 S BROADWAY
8 8 LINDON UT 380 TECHNOLOGY CT, SUITE 200
9 9 ALAMO CA 3236 STONE VALLEY ROAD W
10 10 TUCSON AZ 6245 E. BROADWAY BLVD., SUITE 400
11 11 West Palm Beach FL 1661 Worthingtion Road Suite 100
12 12 Little Rock AR 15909 CANTRELL ROAD
13 13 SEATTLE WA PO BOX 75989
14 14 Houston TX 5120 Woodway Drive Suite #5020
15 15 Staunton VA PO Box 1285
16 16 Evansville IN 501 N Cross Pointe Blvd
17 17 Danville CA 390 Diablo Road Suite 100
18 18 Buffalo NY 701 Seneca Street-Suite 330N
19 19 Dearborn MI 1 Autoclub Drive
20 20 Richardson TX 601 N. Plano Rd.
21 21 Long Island City NY 1 Court Square, 43rd Floor
22 22 IRVINE CA 18201 VON KARMAN AVE. SUITE 300
23 23 

175 175 NEW YORK NY 1120 AVENUE OF THE AMERICAS
176 176 Sandy UT 9272 S. 700 E.
177 177 SAN DIEGO CA 11234 EL CAMINO REAL #100
178 178 LAKE OSWEGO OR 4949 MEADOWS RD, STE 350
179 179 Bellevue WA 3015 112th Ave NE Ste 214
180 180 CHICAGO IL 1800 WEST LARCHMONT AVE
181 181 RANCHO CUCAMONGA CA 8010 HAVEN AVE
182 182 Marlton NJ 701 Route 73 N Suite 2
183 183 CHESTERFIELD MO 14897 CLAYTON ROAD
184 184 Brook Park OH 15887 Snow Road, Suite 200
185 185 CERRITOS CA 18000 STUDEBAKER ROAD, SUITE 200
186 186 TEMPE AZ 1018 E. GUADALUPE ROAD
187 187 MODESTO CA 1156 SCENIC DRIVE SUITE 200
188 188 NEW YORK NY 230 PARK AVE FL 19
189 189 Castle Rock CO 103 4th St. Suite 220
190 190 CRANFORD NJ 20 JACKSON DRIVE 2ND FLOOR
191 191 VIRGINIA BEACH VA 4433 CORPORATION LANE SUITE 300
192 192 Fresno CA 7644 N Palm Avenue
193 193 Santa Ana CA 4 Hutton Centre, 10th Floor
194 194 VIENNA VA 8150 LEESBURG PIKE #410
195 195 DENVER CO 4350 S MONACO STREET, SUITE 200
196 196 Fayetteville GA 145 West Lanier Ave.
197 197

352 352 Oklahoma City OK 9400 S I 35 Service Road
353 353 Marlton NJ 875 ROUTE 73 NORTH Suite F
354 354 Irving TX 8900 Freeport Pkwy, Ste 150
355 355 GILBERT AZ 3345 S. VAL VISTA DRIVE, SUITE 300
356 356 SAN JOSE CA 5615 CHESBRO AVE SUITE 110
357 357 FRANKLIN TN 6100 TOWER CIRCLE SUITE 600
358 358 McLean VA 2010 Corporate Ridge Ste 750
359 359 SAUSALITO CA 28 LIBERTY SHIPWAY SUITE 2800
360 360 WOODLAND HILLS CA 5900 CANOGA AVENUE SUITE 200
361 361 St Louis MO 1600 S Brentwood
362 362 Eden Prairie MN 11200 West 78th Street, Suite 50
363 363 HOUSTON TX 10410 WINDERMERE LAKES BLVD, STE 401-A
364 364 ONTAIO CA 3257 E. GUASTI ROAD, SUITE #320
365 365 Braintree MA 1000 Washington Street
366 366 SOUTH MIAMI FL 5820 BIRD ROAD
367 367 SANTA ANA CA 2510 RED HILL AVE
368 368 Maryland Heights MO 2312 Millpark Dr.
369 369 HUNT VALLEY MD 111011 MCCORMICK RD, SUITE 400
370 370 Birmingham AL 500 Office Park Drive, Suite 310
371 371 Scottsdale AZ 7025 E Greenway Parkway Suite 100
372 372 Henderson TX P

In [37]:
ts_data.to_csv("../output/ts_match.txt", sep="|", index=False)

In [None]:
#split ts_data match tuple
#get LEI by index
#sort by match score, 95 is approximate low end
