#### Imports

In [47]:
import numpy as np
import pandas as pd
import joblib

In [48]:
import utils as u
from model_utils import Model

In [49]:
%load_ext autoreload
%autoreload 2
###
%load_ext lab_black

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


#### Text Loader

In [3]:
# Folder (relative to the notebook's working directory) where the interim input/output files live.
data_location  = 'interim/'

In [4]:
# Read every sheet of the first workbook; sheet_name=None yields a dict of
# DataFrames keyed by sheet name. The path is built from the configured
# data_location instead of repeating the folder name.
data_1 = pd.read_excel(data_location + 'data_1.xlsx', sheet_name=None)

In [5]:
# Stack the last column of every sheet into one single-column frame named "Data_1".
# The column is picked by position, so no scratch "Source" column is written
# (the old one was discarded anyway and overwrote the data whenever a sheet's
# last column was itself called "Source").
data_1_final = pd.concat(
    [
        sheet_df.iloc[:, -1].rename("Data_1").to_frame()
        for sheet_df in data_1.values()
    ],
    ignore_index=True,
)

In [6]:
# Read every sheet of the second workbook; sheet_name=None yields a dict of
# DataFrames keyed by sheet name. The path is built from the configured
# data_location instead of repeating the folder name.
data_2 = pd.read_excel(data_location + 'data_2.xlsx', sheet_name=None)

In [7]:
# Stack the last column of every sheet into one single-column frame named "Data_2".
# The column is picked by position, so no scratch "Source" column is written
# (the old one was discarded anyway and overwrote the data whenever a sheet's
# last column was itself called "Source").
data_2_final = pd.concat(
    [
        sheet_df.iloc[:, -1].rename("Data_2").to_frame()
        for sheet_df in data_2.values()
    ],
    ignore_index=True,
)

In [8]:
def cartesian_product(*arrays):
    """Return all combinations of one element per input array.

    Args:
        *arrays: 1-D arrays to combine.

    Returns:
        A 2-D array with one row per combination and one column per input.
        The first input varies slowest. The dtype is the common result type
        of all inputs.
    """
    n_inputs = len(arrays)
    common_dtype = np.result_type(*arrays)
    grid_shape = [len(values) for values in arrays]
    combos = np.empty(grid_shape + [n_inputs], dtype=common_dtype)
    # np.ix_ gives one broadcastable "open grid" per input; writing it into its
    # own trailing slot lets broadcasting fill the whole grid.
    open_grids = np.ix_(*arrays)
    for slot in range(n_inputs):
        combos[..., slot] = open_grids[slot]
    return combos.reshape(-1, n_inputs)

In [9]:
def cartesian_product_multi(*dfs):
    """Cross-join any number of DataFrames row by row.

    Args:
        *dfs: DataFrames whose rows should be combined in every possible way.

    Returns:
        A DataFrame with one row per combination of input rows. The inputs'
        columns are placed side by side; original column labels are not kept
        (the result uses default integer column labels).
    """
    row_positions = [np.ogrid[: len(frame)] for frame in dfs]
    # Each column of `combos` holds the row positions to take from one input.
    combos = cartesian_product(*row_positions)
    picked = []
    for slot, frame in enumerate(dfs):
        picked.append(frame.values[combos[:, slot]])
    return pd.DataFrame(np.column_stack(picked))

# Every (data_1, data_2) pairing of the two flattened name lists.
test_df = cartesian_product_multi(data_1_final, data_2_final)


In [10]:
test_df.shape

(54696668, 2)

In [11]:
test_df.head()

Unnamed: 0,0,1
0,Yardi,OTTO HARRASSOWITZ GMBH & CO
1,Yardi,GOLDMAN SACHS ASSET MANAGEMENT INTERNATIONAL
2,Yardi,ADDISON LEE PLC DD
3,Yardi,STATE STREET GLOBAL
4,Yardi,SLOANE ROBINSON INVESTMENT SERVICES


In [12]:
# Label the two positional columns; the prediction loop further down selects them by these names.
test_df.columns = ['data_1','data_2']

In [None]:
import math

def index_marks(nrows, chunk_size):
    """Return the interior row offsets that cut ``nrows`` rows into chunks.

    Args:
        nrows: Total number of rows.
        chunk_size: Maximum number of rows per chunk.

    Returns:
        A ``range`` of offsets ``chunk_size, 2 * chunk_size, ...`` that are
        strictly below ``nrows`` rounded up to a multiple of ``chunk_size``.
        It is empty when everything fits in one chunk.
    """
    n_chunks = math.ceil(nrows / chunk_size)
    return range(chunk_size, n_chunks * chunk_size, chunk_size)


def split(dfm, chunk_size):
    """Split ``dfm`` into consecutive pieces of at most ``chunk_size`` rows.

    Args:
        dfm: A DataFrame (or any object with ``.shape`` and row slicing,
            such as a NumPy array).
        chunk_size: Maximum number of rows per piece.

    Returns:
        A list of slices of ``dfm`` in original order. At least one piece is
        always returned, even when ``dfm`` is empty.
    """
    nrows = dfm.shape[0]
    # Same cut points np.split would use, but sliced directly: np.split on a
    # DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
    bounds = [0, *index_marks(nrows, chunk_size), nrows]
    return [dfm[start:stop] for start, stop in zip(bounds, bounds[1:])]

# Pre-split the full cross product into 100-row chunks.
# NOTE(review): `test_df_` is not referenced by any later cell shown in this notebook,
# and test_df has ~54.7M rows (see the recorded shape output), so this builds a very
# large list of frames — confirm it is still needed.
test_df_ = split(test_df, 100)

In [21]:
def rolling(df, window, step):
    """Yield ``(offset, chunk)`` pairs that walk ``df`` in slices of ``window`` rows.

    Args:
        df: Any sliceable object with a length (DataFrame, list, ...).
        window: Number of rows per chunk.
        step: How many rows the offset advances between chunks.

    Yields:
        Tuples of the starting offset and ``df[offset : offset + window]``.
        The last chunk may be shorter than ``window``. One chunk is always
        produced, even when ``df`` is empty.

    Raises:
        ValueError: If ``step`` is not positive. This is a generator, so the
            error is raised when iteration starts.
    """
    if step <= 0:
        # A non-positive step never moves the offset forward, so the loop
        # below would never terminate.
        raise ValueError("step must be a positive integer")

    total_rows = len(df)
    offset = 0
    while offset < total_rows - window:
        yield offset, df[offset : offset + window]
        offset += step
    # Final chunk: whatever remains from the last offset.
    yield offset, df[offset : offset + window]

#### ML Results

In [28]:
# Words handed to u.get_acronym as its ignore list when deriving acronyms from names
# (presumably generic company-name terms to skip — confirm against utils.get_acronym).
words_to_ignore = [
    "capital",
    "management",
    "partners",
    "group",
    "&",
    "asset",
    "investment",
    "of",
    "fund",
    "services",
    "insurance",
    "global",
    "financial",
    "the",
    "investments",
    "consulting",
    "bank",
    "international",
    "solutions",
    "wealth",
    "pension",
    "associates",
    "media",
    "new",
    "london",
    "risk",
    "securities",
    "real",
    "gaming",
    "estate",
    "trust",
    "co",
    "office",
    "family",
    "company",
    "de",
    "research",
    "funds",
    "foundation",
]

# Common-word list used when searching for acronyms: one entry per line of the
# file, with trailing whitespace (including the newline) stripped.
common_words = []
with open("1-1000.txt") as word_file:
    common_words.extend(entry.rstrip() for entry in word_file)

In [42]:
# Text columns of the feature frame built by predict_comparison (raw names and their acronyms).
non_numerical_cols = ["name_a", "name_b", "acr_a", "acr_b"]
# Numeric columns passed to the model's predict() in predict_comparison, in this order.
feature_columns = [
    "acr_match",
    "JW_distance",
    "LV_distance",
    "num_words_a",
    "num_words_b",
    "len_a",
    "len_b",
]

In [65]:
def predict_comparison(datframe, name_column_1, name_column_2, trained_model):
    """Build pairwise name features for two columns and attach model predictions.

    Args:
        datframe: DataFrame holding the two name columns to compare.
        name_column_1: Column of ``datframe`` used as ``name_a``.
        name_column_2: Column of ``datframe`` used as ``name_b``.
        trained_model: Object with a ``predict`` method; it receives the
            ``feature_columns`` of the feature frame.

    Returns:
        A new DataFrame with the names, their acronyms, the numeric features
        and a ``prediction`` column. As a side effect, the value counts of the
        predictions are shown with ``display``.
    """

    def to_acronym(name):
        # Acronym lookup with the notebook-level ignore/common word lists.
        return u.get_acronym(name, words_to_ignore, common_words)

    def pairwise(metric, left_col, right_col):
        # Apply `metric` to two columns of the feature frame, one row at a time.
        return base_df.apply(
            lambda row: metric(row[left_col], row[right_col]), axis=1
        )

    base_df = pd.DataFrame()
    base_df["name_a"] = datframe[name_column_1]
    base_df["name_b"] = datframe[name_column_2]

    # Acronyms for both names, then a numeric flag comparing them.
    for source_col, target_col in (("name_a", "acr_a"), ("name_b", "acr_b")):
        base_df[target_col] = base_df[source_col].apply(to_acronym)
    base_df["acr_match"] = pairwise(u.acronym_checker, "acr_a", "acr_b")

    # Word counts per name.
    for source_col, target_col in (
        ("name_a", "num_words_a"),
        ("name_b", "num_words_b"),
    ):
        base_df[target_col] = base_df[source_col].apply(u.get_number_words)

    # Character length of each name's string form.
    for source_col, target_col in (("name_a", "len_a"), ("name_b", "len_b")):
        base_df[target_col] = base_df[source_col].apply(lambda name: len(str(name)))

    # String distances between the two names.
    base_df["JW_distance"] = pairwise(u.jaro_winkler_distance, "name_a", "name_b")
    base_df["LV_distance"] = pairwise(u.levenshtein_distance, "name_a", "name_b")

    # Score the feature columns and summarise the predicted classes.
    features = base_df[feature_columns]
    base_df["prediction"] = trained_model.predict(features)
    display(base_df["prediction"].value_counts())

    return base_df

### Load Model

In [53]:
# Load the persisted trained model from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load model files that come from a trusted source.
loaded_model = joblib.load("trained_model/finalized_model.sav")

In [59]:
%store -r X_test

In [60]:
# Sanity check: run the loaded model on the X_test restored with %store -r.
loaded_model.predict(X_test)

array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,

In [68]:
# Score the first 300 candidate pairs in 100-row chunks (window=100, step=100).
# Each chunk's feature/prediction frame is displayed and also kept, so the
# results are not lost once the loop ends.
chunk_results = []
for offset, window in rolling(test_df[:300], 100, 100):
    chunk_df = predict_comparison(window, "data_1", "data_2", loaded_model)
    display(chunk_df)
    chunk_results.append(chunk_df)

# Combined predictions, under the name the final export cell writes to CSV.
base_df = pd.concat(chunk_results)

1    100
Name: prediction, dtype: int64

Unnamed: 0,name_a,name_b,acr_a,acr_b,acr_match,num_words_a,num_words_b,len_a,len_b,JW_distance,LV_distance,prediction
0,Yardi,OTTO HARRASSOWITZ GMBH & CO,,otto,0,1,5,6,27,0.401235,26,1
1,Yardi,GOLDMAN SACHS ASSET MANAGEMENT INTERNATIONAL,,,0,1,5,6,44,0.396465,43,1
2,Yardi,ADDISON LEE PLC DD,,lee,0,1,4,6,18,0.407407,17,1
3,Yardi,STATE STREET GLOBAL,,,0,1,3,6,19,0.406433,18,1
4,Yardi,SLOANE ROBINSON INVESTMENT SERVICES,,,0,1,4,6,35,0.398413,34,1
...,...,...,...,...,...,...,...,...,...,...,...,...
95,Yardi,TRADER,,,0,1,1,6,6,0.000000,6,1
96,Yardi,PEKING UNIVERSITY,,,0,1,2,6,17,0.408497,16,1
97,Yardi,BEIJING FOREIGN STUDIES UNIVERSITY,,,0,1,4,6,34,0.398693,33,1
98,Yardi,SUNWAY UNIVERSITY,,,0,1,2,6,17,0.483660,16,1


1    100
Name: prediction, dtype: int64

Unnamed: 0,name_a,name_b,acr_a,acr_b,acr_match,num_words_a,num_words_b,len_a,len_b,JW_distance,LV_distance,prediction
100,Yardi,ASTON UNIVERSITY PENSION SCHEME,,,0,1,4,6,31,0.399642,29,1
101,Yardi,JINAN UNIVERSITY,,,0,1,2,6,16,0.409722,15,1
102,Yardi,MINZU UNIVERSITY OF CHINA,,,0,1,4,6,25,0.402222,24,1
103,Yardi,NANJING UNIVERSITY OF SCIENCE & TECHNOLOGY,,,0,1,6,6,42,0.293651,40,1
104,Yardi,EASTERN MEDITERRANEAN UNIVERSITY,,,0,1,3,6,32,0.399306,31,1
...,...,...,...,...,...,...,...,...,...,...,...,...
195,Yardi,PAYDEN & RYGEL,,,0,1,3,6,14,0.492063,12,1
196,Yardi,HUAJIN SECURITY,,,0,1,2,6,15,0.411111,14,1
197,Yardi,PGIM,,pgim,0,1,1,6,4,0.000000,6,1
198,Yardi,PRICEWATERHOUSECOOPERS PWC,,pwc,0,1,2,6,26,0.000000,25,1


1    100
Name: prediction, dtype: int64

Unnamed: 0,name_a,name_b,acr_a,acr_b,acr_match,num_words_a,num_words_b,len_a,len_b,JW_distance,LV_distance,prediction
200,Yardi,CHINA MERCHANTS BANK,,,0,1,3,6,20,0.405556,19,1
201,Yardi,ROPES & GRAY,,,0,1,3,6,12,0.416667,11,1
202,Yardi,GUIZHOU BRAUN ENTERPRISE GROUP,,,0,1,4,6,30,0.400000,29,1
203,Yardi,HARVEST FUND MANAGEMENT CO,,,0,1,4,6,26,0.401709,25,1
204,Yardi,SKADDEN ARPS SLATE MEAGHER & FLOM,,arps,0,1,6,6,33,0.398990,32,1
...,...,...,...,...,...,...,...,...,...,...,...,...
295,Yardi,SPARKLING,,,0,1,1,6,9,0.000000,9,1
296,Yardi,TESCO STORES,,,0,1,2,6,12,0.416667,11,1
297,Yardi,LIBRA EQUITY,,,0,1,2,6,12,0.416667,11,1
298,Yardi,CEDAR ROSE INT SERVICES,,int,0,1,4,6,23,0.403382,22,1


In [None]:
# Write the prediction results to the interim folder as CSV.
# NOTE(review): this expects a notebook-level `base_df`; inside predict_comparison that
# name is only a local variable, so confirm an earlier cell defines it before running
# on a fresh kernel.
base_df.to_csv('interim/large_excel.csv')