#### Imports

In [3]:
import numpy as np
import pandas as pd
import joblib

In [4]:
import utils as u
from model_utils import Model

In [5]:
%load_ext autoreload
%autoreload 2
###
%load_ext lab_black

#### Text Loader

In [12]:
# Directory (relative to the notebook) that holds the intermediate input files.
# NOTE(review): the read_excel calls below repeat this literal instead of using
# this variable, so changing it here currently has no effect.
data_location = "interim/"

In [13]:
# sheet_name=None loads every sheet of the workbook; the result is a dict
# mapping sheet name -> DataFrame (it is iterated by key further down).
# NOTE(review): the "interim/" prefix duplicates `data_location`.
data_1 = pd.read_excel("interim/data_1.xlsx", sheet_name=None)

In [17]:
# Each sheet contributes only its LAST column, stored under the common column
# name "data_1"; the per-sheet frames are then stacked into one single-column
# frame with a fresh 0..n-1 index.
# The column is selected straight from the sheet: no full-sheet copy and no
# intermediate "Source" column (it was dropped again before use and would have
# overwritten the data if a sheet's last column were itself called "Source").
data_1_final = pd.concat(
    [
        pd.DataFrame({"data_1": sheet_df[sheet_df.columns[-1]]})
        for sheet_df in data_1.values()
    ],
    ignore_index=True,
)

In [18]:
# sheet_name=None loads every sheet of the workbook; the result is a dict
# mapping sheet name -> DataFrame (it is iterated by key further down).
# NOTE(review): the "interim/" prefix duplicates `data_location`.
data_2 = pd.read_excel("interim/data_2.xlsx", sheet_name=None)

In [19]:
# Each sheet contributes only its LAST column, stored under the common column
# name "data_2"; the per-sheet frames are then stacked into one single-column
# frame with a fresh 0..n-1 index.
# The column is selected straight from the sheet: no full-sheet copy and no
# intermediate "Source" column (it was dropped again before use and would have
# overwritten the data if a sheet's last column were itself called "Source").
data_2_final = pd.concat(
    [
        pd.DataFrame({"data_2": sheet_df[sheet_df.columns[-1]]})
        for sheet_df in data_2.values()
    ],
    ignore_index=True,
)

In [31]:
# Remove repeated rows from both single-column frames (first occurrence is
# kept). The index is not reset here, so its labels are no longer contiguous
# afterwards.
data_1_final = data_1_final.drop_duplicates()
data_2_final = data_2_final.drop_duplicates()

In [34]:
def rolling(df, window, step):
    """Yield ``(offset, chunk)`` pairs that walk ``df`` in fixed-size slices.

    Parameters
    ----------
    df : sliceable sequence
        Anything supporting ``len()`` and ``df[start:stop]`` slicing
        (a DataFrame, a list, ...).
    window : int
        Maximum number of items in each chunk; must be >= 1.
    step : int
        How far the offset advances between chunks; must be >= 1.

    Yields
    ------
    tuple
        ``(offset, df[offset : offset + window])``. The last chunk may hold
        fewer than ``window`` items, but a chunk is never empty.

    Raises
    ------
    ValueError
        If ``window`` or ``step`` is smaller than 1. Because this is a
        generator, the error surfaces when iteration starts, not at call time.
    """
    # A non-positive step would never advance the offset (endless loop) and a
    # non-positive window could only ever produce empty chunks.
    if window < 1 or step < 1:
        raise ValueError("window and step must be positive integers")

    df_length = len(df)
    count = 0
    while count < (df_length - window):
        yield count, df[count : window + count]
        count += step

    # Emit the remaining tail only when it actually contains data: for empty
    # input, or when `step` jumped past the end, there is nothing left to yield.
    if count < df_length:
        yield count, df[count : window + count]

def cartesian_product(*arrays):
    """Return every combination of elements drawn from the given 1-D arrays.

    The result is a 2-D array with one column per input array and one row per
    combination; rows are ordered so that the last input varies fastest. The
    output dtype is ``np.result_type`` of all inputs.
    """
    n_inputs = len(arrays)
    common_dtype = np.result_type(*arrays)

    # One axis per input plus a trailing axis that selects which input's value
    # is stored at that grid position.
    grid_shape = [len(values) for values in arrays]
    grid_shape.append(n_inputs)
    grid = np.empty(grid_shape, dtype=common_dtype)

    # np.ix_ reshapes each input so it broadcasts along its own axis only.
    for position, open_mesh in enumerate(np.ix_(*arrays)):
        grid[..., position] = open_mesh

    return grid.reshape(-1, n_inputs)


def cartesian_product_multi(*dfs):
    """Cross-join the rows of several DataFrames into one DataFrame.

    Every row of each input is paired with every row of all the others; the
    inputs' columns are placed side by side in argument order. The returned
    frame has default integer column labels, so callers rename them.
    """
    # Row positions 0..len-1 for each input frame.
    row_positions = [np.ogrid[: len(frame)] for frame in dfs]
    # combos[:, k] is the row position to take from the k-th frame.
    combos = cartesian_product(*row_positions)

    picked = []
    for col, frame in enumerate(dfs):
        picked.append(frame.values[combos[:, col]])

    return pd.DataFrame(np.column_stack(picked))

#### ML Results

In [37]:
# Words handed to u.get_acronym as its "ignore" list when acronyms are derived
# in predict_comparison (presumably generic company-name terms that should not
# contribute to an acronym - confirm against utils.get_acronym).
words_to_ignore = [
    "capital",
    "management",
    "partners",
    "group",
    "&",
    "asset",
    "investment",
    "of",
    "fund",
    "services",
    "insurance",
    "global",
    "financial",
    "the",
    "investments",
    "consulting",
    "bank",
    "international",
    "solutions",
    "wealth",
    "pension",
    "associates",
    "media",
    "new",
    "london",
    "risk",
    "securities",
    "real",
    "gaming",
    "estate",
    "trust",
    "co",
    "office",
    "family",
    "company",
    "de",
    "research",
    "funds",
    "foundation",
]

# Word list consulted when searching for acronyms: one entry per line of the
# file, with trailing whitespace (including the newline) stripped.
common_words = []
with open("1-1000.txt") as word_file:
    common_words.extend(entry.rstrip() for entry in word_file)

In [38]:
# Text columns that predict_comparison adds to its result frame. This list is
# not referenced anywhere else in the visible part of the notebook.
non_numerical_cols = ["name_a", "name_b", "acr_a", "acr_b"]
# Columns selected, in this order, as the model input in predict_comparison.
# NOTE(review): presumably has to match the columns/order used when the model
# was trained - confirm before editing.
feature_columns = [
    "acr_match",
    "JW_distance",
    "LV_distance",
    "num_words_a",
    "num_words_b",
    "len_a",
    "len_b",
]

In [39]:
def predict_comparison(dataframe, name_column_1, name_column_2, trained_model):
    """Build pairwise name features for two columns and score them with a model.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        One row per candidate pair of names.
    name_column_1, name_column_2 : str
        Columns of ``dataframe`` holding the two names of each pair; both are
        cast to ``str`` before any feature is computed.
    trained_model : object
        Must provide ``predict(X)`` and ``predict_proba(X)``; column 1 of the
        latter is stored as the probability.
        NOTE(review): presumably a binary scikit-learn style classifier - confirm.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed like ``dataframe`` with the columns ``name_a``,
        ``name_b``, ``acr_a``, ``acr_b``, ``acr_match``, ``num_words_a``,
        ``num_words_b``, ``len_a``, ``len_b``, ``JW_distance``,
        ``LV_distance``, ``prediction`` and ``prediction_proba``.

    Notes
    -----
    Reads the notebook-level ``words_to_ignore``, ``common_words`` and
    ``feature_columns`` and the helper module ``u``.
    """
    base_df = pd.DataFrame()

    base_df["name_a"] = dataframe[name_column_1].astype(str)
    base_df["name_b"] = dataframe[name_column_2].astype(str)

    # Get the acronyms for a and b
    acronym_args = (words_to_ignore, common_words)
    base_df["acr_a"] = base_df["name_a"].apply(u.get_acronym, args=acronym_args)
    base_df["acr_b"] = base_df["name_b"].apply(u.get_acronym, args=acronym_args)

    # Create a numerical field for the same acronyms.
    # Pairwise features iterate the two columns in lockstep instead of using
    # DataFrame.apply(axis=1), which builds a Series object for every row.
    base_df["acr_match"] = [
        u.acronym_checker(acr_a, acr_b)
        for acr_a, acr_b in zip(base_df["acr_a"], base_df["acr_b"])
    ]

    # Get the number of words
    base_df["num_words_a"] = base_df["name_a"].apply(u.get_number_words)
    base_df["num_words_b"] = base_df["name_b"].apply(u.get_number_words)

    # Get the length of the strings
    base_df["len_a"] = base_df["name_a"].apply(lambda x: len(str(x)))
    base_df["len_b"] = base_df["name_b"].apply(lambda x: len(str(x)))

    # Get Jaro Winkler distance
    base_df["JW_distance"] = [
        u.jaro_winkler_distance(name_a, name_b)
        for name_a, name_b in zip(base_df["name_a"], base_df["name_b"])
    ]

    # Get Levenshtein distance
    base_df["LV_distance"] = [
        u.levenshtein_distance(name_a, name_b)
        for name_a, name_b in zip(base_df["name_a"], base_df["name_b"])
    ]

    # Score the pairs using only the model's feature columns.
    X_test = base_df[feature_columns]

    base_df["prediction"] = trained_model.predict(X_test)
    base_df["prediction_proba"] = trained_model.predict_proba(X_test)[:, 1]

    return base_df

### Load Model

In [41]:
# Load the previously trained model from disk.
# NOTE: joblib.load is pickle-based and can execute arbitrary code while
# loading - only open model files that come from a trusted source.
loaded_model = joblib.load("trained_model/finalized_model.sav")

### Test Model Output

In [48]:
# Score every (data_1, data_2) pair, one chunk of data_1 rows at a time, and
# collect the pairs the model labels 0. `matching_df` ends up as a list with
# one filtered frame per chunk that produced at least one such pair.
# NOTE(review): only the first two data_1 rows are scored here (smoke test);
# widen the slice / window for a full run.
matching_df = []
for offset, window in rolling(data_1_final[:2], 1, 1):
    # rolling(frame, window_size, step) yields (offset, chunk): `offset` is the
    # position of the chunk's first row and `window` is the chunk itself.
    # `offset` is not used below; it is available to label per-chunk output.

    # Pair each row of the chunk with every row of data_2.
    temp_df = cartesian_product_multi(*[window, data_2_final])
    temp_df.columns = ["data_1", "data_2"]
    temp_df = predict_comparison(temp_df, "data_1", "data_2", loaded_model)
    # NOTE(review): assumes label 0 marks a match - confirm against the labels
    # the model was trained with.
    temp_df = temp_df[temp_df["prediction"] == 0]

    # Keep the filtered pairs so they are not lost when the next chunk
    # overwrites temp_df.
    if not temp_df.empty:
        matching_df.append(temp_df)

Unnamed: 0,name_a,name_b,acr_a,acr_b,acr_match,num_words_a,num_words_b,len_a,len_b,JW_distance,LV_distance,prediction,prediction_proba
0,Yardi,OTTO HARRASSOWITZ GMBH & CO,,otto,0,1,5,6,27,0.401235,26,1,0.635891
1,Yardi,GOLDMAN SACHS ASSET MANAGEMENT INTERNATIONAL,,,0,1,5,6,44,0.396465,43,1,0.637190
2,Yardi,ADDISON LEE PLC DD,,lee,0,1,4,6,18,0.407407,17,1,0.640299
3,Yardi,STATE STREET GLOBAL,,,0,1,3,6,19,0.406433,18,1,0.694958
4,Yardi,SLOANE ROBINSON INVESTMENT SERVICES,,,0,1,4,6,35,0.398413,34,1,0.621524
...,...,...,...,...,...,...,...,...,...,...,...,...,...
43834,Yardi,MDD FORENSIC ACCOUNTANTS,,mdd,0,1,3,6,24,0.402778,23,1,0.696060
43835,Yardi,SERRA STREET,,,0,1,2,6,12,0.416667,11,1,0.567021
43836,Yardi,UNIVERSITY OF CHINESE ACADEMY OF SOCIAL SCIENCES,,,0,1,7,6,48,0.458333,46,1,0.650190
43837,Yardi,JEM RACING,,jem,0,1,2,6,10,0.422222,10,1,0.568603


Unnamed: 0,name_a,name_b,acr_a,acr_b,acr_match,num_words_a,num_words_b,len_a,len_b,JW_distance,LV_distance,prediction,prediction_proba
0,CBRE Capital Advisors,OTTO HARRASSOWITZ GMBH & CO,cbre,otto,0,3,5,22,27,0.337486,25,1,0.630381
1,CBRE Capital Advisors,GOLDMAN SACHS ASSET MANAGEMENT INTERNATIONAL,cbre,,0,3,5,22,44,0.358586,39,1,0.641890
2,CBRE Capital Advisors,ADDISON LEE PLC DD,cbre,lee,0,3,4,22,18,0.368350,21,1,0.624984
3,CBRE Capital Advisors,STATE STREET GLOBAL,cbre,,0,3,3,22,19,0.430144,20,1,0.550364
4,CBRE Capital Advisors,SLOANE ROBINSON INVESTMENT SERVICES,cbre,,0,3,4,22,35,0.363203,31,1,0.626370
...,...,...,...,...,...,...,...,...,...,...,...,...,...
43834,CBRE Capital Advisors,MDD FORENSIC ACCOUNTANTS,cbre,mdd,0,3,3,22,24,0.441378,22,1,0.580324
43835,CBRE Capital Advisors,SERRA STREET,cbre,,0,3,2,22,12,0.338384,20,1,0.727308
43836,CBRE Capital Advisors,UNIVERSITY OF CHINESE ACADEMY OF SOCIAL SCIENCES,cbre,,0,3,7,22,48,0.468434,42,1,0.654536
43837,CBRE Capital Advisors,JEM RACING,cbre,jem,0,3,2,22,10,0.442424,20,1,0.725980
