# Load Data

In [None]:
import os
import pandas as pd 
import numpy as np
import logging

## Setup

In [None]:
folder = 'internal_path_to_file'
input_file = 'yellow_yelp_all_pypostal.csv'
output_file = 'yellow_yelp_all_pypostal_output1.csv'
settings_file = 'yellow_yelp_all_pypostal_learned_settings1'
training_file = 'yellow_yelp_all_pypostal_training1.json'

In [None]:
fp = os.path.join(folder, input_file)

In [None]:
matches_file = os.path.join(folder, 'yellow_yelp_label2.csv')

In [None]:
log_level = logging.INFO
log_level = logging.DEBUG
logging.getLogger().setLevel(log_level)


## Dataframe view

In [None]:
input_df = pd.read_csv(fp, sep=',', quotechar='"', dtype={'postalcode':'str'})

In [None]:
def get_clean_postalcode(x):
    """Return the postal code with anything after the first '.' removed.

    Example: ``'12345.0'`` becomes ``'12345'``.

    Args:
        x: Raw postal code value; it is converted with ``str`` before
            splitting.

    Returns:
        The text before the first ``'.'``, or ``None`` when ``x`` is missing
        (``None`` or NaN).
    """
    # pandas hands missing CSV cells over as NaN rather than None; without
    # this check str(nan) would turn every missing code into the text 'nan'.
    if pd.isna(x):
        return None
    return str(x).split('.')[0]

In [None]:
input_df['postalcode'] = input_df['postalcode'].apply(lambda x: get_clean_postalcode(x))

In [None]:
input_df.to_csv(fp, sep=',', quotechar='"', index=False)

In [None]:
# Distinct values of the 'category' column, leaving out the NaN entry that
# stands for missing values (its string form is 'nan').
categories = [
    label
    for label in input_df['category'].unique()
    if str(label) != 'nan'
]

In [None]:
#category_corpus = input_df[['name', 'category']].drop_duplicates().to_dict(orient='records')
category_corpus = input_df.drop_duplicates().to_dict(orient='records')

In [None]:
category_corpus[0]

In [None]:
type(category_corpus)

In [None]:
phone_corpus = input_df[['name', 'phone']].to_dict(orient='records')

# Dedupe


## Import modules

In [None]:
import os
import csv
import re
import logging
import optparse

import dedupe
from unidecode import unidecode

from itertools import product

In [None]:
def pre_process(val):
    """
    Normalise a raw cell value so that trivial differences are ignored.

    Newlines are replaced by spaces, runs of spaces are collapsed to a single
    space, surrounding whitespace and quote characters are stripped, and the
    text is lower-cased.

    Args:
        val: The raw value. Anything that is not a ``str`` (for example
            ``None``) is returned unchanged.

    Returns:
        The cleaned string, or ``None`` when nothing is left after cleaning.
    """
    if not isinstance(val, str):
        return val
    # Replace newlines *before* collapsing spaces; the other order leaves a
    # run of spaces behind for input such as "a \n b".
    val = val.replace('\n', ' ')
    val = re.sub(' {2,}', ' ', val)
    val = val.strip().strip('"').strip("'").lower().strip()
    # If data is missing, indicate that by setting the value to `None`
    return val or None

In [None]:
def get_clean_data_dict(file_path):
    """Read a CSV file into a ``{record id: cleaned record}`` dictionary.

    Args:
        file_path: Path of the CSV file to read. It must contain an ``id``
            column whose values can be converted with ``int``.

    Returns:
        A dict keyed by ``int(row['id'])``. Each value is a dict mapping
        every column name of the row to ``pre_process(value)``.

    Raises:
        KeyError: If a row has no ``id`` column.
        ValueError: If an ``id`` value cannot be converted to ``int``.
    """
    data_d = {}
    # Open the path that was passed in, not a notebook-level variable, so
    # the function keeps working when that variable is rebound later.
    with open(file_path) as f:
        for row in csv.DictReader(f):
            data_d[int(row['id'])] = {k: pre_process(v) for k, v in row.items()}

    return data_d

### Get Data in needed format

In [None]:
data_dict = get_clean_data_dict(fp)

In [None]:
data_dict

### Define the Fields for dedupe


In [None]:
# Variable definitions passed to dedupe.Dedupe: each dict names a record
# field and the comparison type to use for it.
# NOTE(review): 'name' appears twice (once as 'Name', once as 'String') --
# confirm that comparing the same field under both types is intentional.
# NOTE(review): 'corpus' is given `category_corpus`, which is built as a list
# of whole record dicts; verify against the dedupe variable documentation
# that this is the expected corpus format.
fields = [
    {'field' : 'name', 'type': 'Name'},
    {'field' : 'category', 
     'type': 'FuzzyCategorical',
     'categories': categories,
     'corpus': category_corpus,
     'has missing' : True},
    {'field' : 'name', 'type': 'String'},
    {'field': 'postalcode', 'variable name': 'postalcode', 'type': 'Exact'},
    {'field' : 'address', 'type': 'Address'},
    {'field' : 'city', 'type': 'ShortString'},
    {'field' : 'phone', 'type': 'String'},
    {'field' : 'street', 'type': 'String', 'has missing' : True},
    {'field' : 'house_number', 'type': 'Exists', 'has missing' : True},
    {'field' : 'house', 'type': 'String', 'has missing' : True},
    
]

### Instantiate Dedupe

In [None]:
deduper = dedupe.Dedupe(fields)

In [None]:
deduper.prepare_training(data_dict, blocked_proportion=0.7)

In [None]:
dedupe.consoleLabel(deduper)

In [None]:
deduper.train()

In [None]:
with open(training_file, 'w') as tf:
    deduper.writeTraining(tf)

In [None]:
with open(settings_file, 'wb') as sf:
    deduper.writeSettings(sf)

## Run dedupe based on a prior settings file

In [None]:
deduper = None
with open(settings_file, 'rb') as f:
    deduper = dedupe.StaticDedupe(f)

#### ------ 

In [None]:
threshold = deduper.threshold(data_dict, recall_weight=1)


In [None]:
deduper.predicates

In [None]:
clustered_dupes = deduper.match(data_dict, 0)

In [None]:
print('# duplicate sets', len(clustered_dupes))

In [None]:
# Build a lookup from record id to its cluster: the cluster's index, the
# canonical record computed for the cluster, and the record's own score.
cluster_membership = {}
# Pre-set so `cluster_id` is defined even if `clustered_dupes` is empty; the
# output-writing cell reads it afterwards to number singleton records.
cluster_id = 0
for (cluster_id, cluster) in enumerate(clustered_dupes):
    # Each cluster is unpacked as (record ids, per-record scores).
    id_set, scores = cluster
    cluster_d = [data_dict[c] for c in id_set]
    # NOTE: `canonical_rep` stays bound after the loop and is read by the
    # output-writing cell to obtain the canonical column names.
    canonical_rep = dedupe.canonicalize(cluster_d)
    for record_id, score in zip(id_set, scores):
        cluster_membership[record_id] = {
            "cluster id" : cluster_id,
            "canonical representation" : canonical_rep,
            "confidence": score
        }

In [None]:
# Copy the input CSV to `output_file`, prefixing every row with its cluster id
# and confidence score and appending the cluster's canonical field values.
# Records that belong to no cluster get a fresh id of their own, numbered
# after the last real cluster, and empty confidence/canonical cells.
# NOTE(review): this relies on `canonical_rep` left over from the clustering
# loop, so it raises NameError when no clusters were found -- confirm whether
# that case needs handling.
singleton_id = cluster_id + 1
# newline='' is what the csv module asks for on files given to csv.writer;
# without it some platforms emit an extra '\r' per row.
with open(output_file, 'w', newline='') as f_output, open(fp) as f_input:
    writer = csv.writer(f_output)
    reader = csv.reader(f_input)

    heading_row = next(reader)
    heading_row.insert(0, 'confidence_score')
    heading_row.insert(0, 'Cluster ID')
    # Snapshot the key order once so the header and every row agree on it.
    canonical_keys = list(canonical_rep)
    for key in canonical_keys:
        heading_row.append('canonical_' + key)

    writer.writerow(heading_row)

    for row in reader:
        # The record id is taken from the first column of the input file.
        row_id = int(row[0])
        if row_id in cluster_membership:
            cluster_id = cluster_membership[row_id]["cluster id"]
            canonical_rep = cluster_membership[row_id]["canonical representation"]
            row.insert(0, cluster_membership[row_id]['confidence'])
            row.insert(0, cluster_id)
            for key in canonical_keys:
                # Write the text itself: encoding to bytes here would make
                # csv.writer emit the repr "b'...'" on Python 3.
                row.append(canonical_rep[key])
        else:
            row.insert(0, None)
            row.insert(0, singleton_id)
            singleton_id += 1
            for key in canonical_keys:
                row.append(None)
        writer.writerow(row)

# Predictions

In [None]:
df = pd.read_csv(output_file)

In [None]:
df.columns

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv(output_file)
df.sort_values(['Cluster ID'], inplace=True)
relevant_data = df[['Cluster ID', 'confidence_score', 'source', 'id']]

In [None]:
# Turn the clustered output into predicted cross-source pairs: for every
# cluster with more than one row, pair each 'yellow_pages' id with each
# 'yelp' id in that cluster.
df = pd.read_csv(output_file)
df.sort_values(['Cluster ID'], inplace=True)
relevant_data = df[['Cluster ID', 'confidence_score', 'source', 'id']]

predictions = []
# value_counts gives the number of rows per cluster id.
cluster_ids = relevant_data['Cluster ID'].value_counts()
for cluster_id in cluster_ids[cluster_ids>1].index:
    
    # NOTE: despite the names, `fodors_ids` holds the 'yellow_pages' ids and
    # `zagats_ids` holds the 'yelp' ids of this cluster.
    fodors_ids = relevant_data[
        (relevant_data['Cluster ID'] == cluster_id) & 
        (relevant_data['source'] == 'yellow_pages')
    ].id.values
    zagats_ids = relevant_data[
        (relevant_data['Cluster ID'] == cluster_id) & 
        (relevant_data['source'] == 'yelp')
    ].id.values
    
    # All (yellow_pages id, yelp id) combinations within the cluster.
    match_interim = list(product(fodors_ids, zagats_ids))
    predictions.append(match_interim)
    
# Flatten the per-cluster pair lists into one list of [id, id] rows.
m = []
for cluster in predictions: 
    for combo in cluster: 
        m.append([combo[0], combo[1]])
        
predictions = pd.DataFrame(m, columns=['yellow_pages_id', 'yelp_id'])

# String key "<yellow_pages_id>-<yelp_id>" used for set comparisons later on.
predictions['yp-y'] = predictions.apply(lambda row: f"{row['yellow_pages_id']}-{row['yelp_id']}", axis=1)



In [None]:
len(predictions)

In [None]:
results = pd.read_csv(matches_file)
results['yp-y'] = results.apply(lambda row: f"{row['yellow_pages_id']}-{row['yelp_id']}", axis=1)



In [None]:
len(results)

In [None]:
non_duplicates = results[results['duplicate'] == 0]

In [None]:
non_dup_set = set(non_duplicates['yp-y'].values.tolist())

In [None]:
fp = (pred_set & non_dup_set)

In [None]:
fp

In [None]:
len(non_duplicates)

In [None]:
len(duplicates)

In [None]:
yellow_pages_dup_ids = set(duplicates['yellow_pages_id'].to_list())

In [None]:
yelp_dup_ids = set(duplicates['yelp_id'].to_list())

In [None]:
def filter_entries(x):
    """Return ``x`` unless one of its ids is in the known-duplicate id sets.

    Args:
        x: Record supporting ``x['yellow_pages_id']`` and ``x['yelp_id']``.

    Returns:
        ``None`` when ``x['yellow_pages_id']`` is in ``yellow_pages_dup_ids``
        or ``x['yelp_id']`` is in ``yelp_dup_ids``; otherwise ``x`` itself.
    """
    # NOTE(review): entries that touch a known duplicate id are the ones
    # dropped here -- confirm this is the intended direction of the filter.
    if x['yellow_pages_id'] in yellow_pages_dup_ids or x['yelp_id'] in yelp_dup_ids:
        return None
    return x
    
    
    

In [None]:
# Keep only the predicted pairs in which at least one side is an id that
# occurs among the labelled duplicates.
preds_comparable_with_duplicates = predictions.loc[
    predictions['yellow_pages_id'].isin(yellow_pages_dup_ids)
    | predictions['yelp_id'].isin(yelp_dup_ids)
]

In [None]:
len(predictions)

In [None]:
len(preds_comparable_with_duplicates)

In [None]:
pred_set = set(preds_comparable_with_duplicates['yp-y'].values.tolist())

In [None]:
duplicates = results[results['duplicate'] == 1]

In [None]:
results = results[results['duplicate'] == 1]

In [None]:
len(results)

In [None]:
res_set = set(duplicates['yp-y'].values.tolist())
#pred_set = set(predictions['yp-y'].values.tolist())

In [None]:
# Pair-level counts: pairs in both sets, labelled pairs that were not
# predicted, and predicted pairs that are not labelled duplicates.
tp = len(res_set & pred_set)
fn = len(res_set-pred_set)
# NOTE(review): `fp` is bound to the input CSV path near the top of this
# notebook (os.path.join(folder, input_file)); rebinding it to a count here
# means cells that read `fp` as a path will fail if re-run after this one.
fp = len(pred_set-res_set)

print(f'tp: {tp} fp: {fp} fn: {fn}')


In [None]:
folder = r'/Hadoco/1_Standard Data Integration/Sample Datasets/Unprocessed Data/customer_samples/fodors_zagats_restaurants'

In [None]:
predictions[predictions['zagats_id']==220]

In [None]:
results[results['zagats_id']==220]

In [None]:
results.head()

In [None]:
df[df['Cluster ID']==0]




In [None]:
df[df['Id'].isin(['534', '219', '221'])]

# New

In [None]:
output_file

In [None]:
df = pd.read_csv(output_file)
df.sort_values(['Cluster ID'], inplace=True)
relevant_data = df[['Cluster ID', 'confidence_score', 'source', 'Id']]

In [None]:
relevant_data.head()

In [None]:
len(relevant_data['Cluster ID'].unique())

In [None]:
len(relevant_data['Cluster ID'].value_counts())

In [None]:
predictions = []
cluster_ids = relevant_data['Cluster ID'].value_counts()

In [None]:
def prepare_matches_file(matches_filepath, source1_name, source2_name):
    """Load the labelled pairs CSV and add a combined pair-key column.

    Args:
        matches_filepath: Path of the CSV file holding the labelled pairs.
            It must contain the columns ``<source1_name>_id`` and
            ``<source2_name>_id``.
        source1_name: Name of the first source (e.g. ``'fodors'``).
        source2_name: Name of the second source (e.g. ``'zagats'``).

    Returns:
        The loaded DataFrame with one extra column, named from the first
        letter of each source (e.g. ``'f-z'``), holding
        ``"<source1 id>-<source2 id>"`` strings.
    """
    results = pd.read_csv(matches_filepath)

    source1_col_name = source1_name + '_id'
    source2_col_name = source2_name + '_id'
    combo_col_name = source1_name[0] + '-' + source2_name[0]

    # Build the key column-wise: a row-wise apply would upcast integer ids to
    # floats ("1.0-10.0") whenever the frame also has float columns, and it
    # misbehaves on an empty frame.
    results[combo_col_name] = (
        results[source1_col_name].astype(str)
        + '-'
        + results[source2_col_name].astype(str)
    )
    return results

def prepare_predictions_file(output_file_path, source1_name, source2_name,
                             id_column='Id'):
    """Build the predicted cross-source pairs from a clustered output CSV.

    Every cluster with more than one row contributes all combinations of its
    ``source1_name`` ids with its ``source2_name`` ids.

    Args:
        output_file_path: Path of the clustered CSV. It must contain the
            columns ``'Cluster ID'``, ``'source'`` and ``id_column``.
        source1_name: Value of ``'source'`` identifying the first source.
        source2_name: Value of ``'source'`` identifying the second source.
        id_column: Name of the record-id column (defaults to ``'Id'``).

    Returns:
        A DataFrame with the columns ``<source1_name>_id``,
        ``<source2_name>_id`` and a pair-key column named from the first
        letter of each source (e.g. ``'f-z'``) holding
        ``"<source1 id>-<source2 id>"`` strings. Rows are ordered by
        cluster id.
    """
    df = pd.read_csv(output_file_path)

    source1_col_name = source1_name + '_id'
    source2_col_name = source2_name + '_id'
    combo_col_name = source1_name[0] + '-' + source2_name[0]

    pairs = []
    # A single groupby pass replaces re-filtering the whole frame once per
    # cluster.
    for _, members in df.groupby('Cluster ID'):
        if len(members) < 2:
            continue
        source1_ids = members.loc[members['source'] == source1_name, id_column]
        source2_ids = members.loc[members['source'] == source2_name, id_column]
        pairs.extend(product(source1_ids.tolist(), source2_ids.tolist()))

    predictions = pd.DataFrame(pairs, columns=[source1_col_name, source2_col_name])
    # Column-wise concatenation also works when no pairs were found.
    predictions[combo_col_name] = (
        predictions[source1_col_name].astype(str)
        + '-'
        + predictions[source2_col_name].astype(str)
    )
    return predictions

def calculate_f1_stats(match_set, prediction_set, mismatch_set=False):
    """Compare predicted pairs with labelled pairs and return summary counts.

    Args:
        match_set: Set of pair keys labelled as true matches.
        prediction_set: Set of pair keys predicted as matches.
        mismatch_set: Optional set of pair keys labelled as non-matches.
            When it is empty or not given, true negatives, recall and F1
            are reported as ``None``.

    Returns:
        A dict with the keys ``'f1_score'``, ``'recall'``, ``'precision'``,
        ``'true_positive'``, ``'true_negative'``, ``'false_positive'`` and
        ``'false_negative'``. Ratios that cannot be computed are ``None``.
    """
    # All counts come from the arguments, never from notebook-level sets.
    tp = len(match_set & prediction_set)
    fn = len(match_set - prediction_set)
    fp = len(prediction_set - match_set)

    tn = None
    precision = None
    recall = None
    f1 = None

    if tp > 0:
        precision = tp / (tp + fp)

    # NOTE(review): recall and F1 are only reported when a mismatch set is
    # supplied, as before -- confirm whether they should always be computed.
    if mismatch_set:
        tn = len(mismatch_set - prediction_set)
        if tp + fn > 0:
            recall = tp / (tp + fn)
        # precision is None when there are no true positives; F1 is then
        # left as None instead of failing on None arithmetic.
        if precision is not None and recall is not None:
            f1 = 2 * ((precision * recall) / (precision + recall))

    stats = {
        'f1_score': f1,
        'recall': recall,
        'precision': precision,
        'true_positive': tp,
        'true_negative': tn,
        'false_positive': fp,
        'false_negative': fn,
    }

    return stats


def get_f1_stats(predictions_file_path, matches_file_path,
                       source1_name, source2_name):
    """Score a clustered output file against a labelled matches file.

    Args:
        predictions_file_path: Path passed to ``prepare_predictions_file``.
        matches_file_path: Path passed to ``prepare_matches_file``.
        source1_name: Name of the first source (e.g. ``'fodors'``).
        source2_name: Name of the second source (e.g. ``'zagats'``).

    Returns:
        The dict produced by ``calculate_f1_stats`` for the two pair-key
        sets.
    """
    predictions = prepare_predictions_file(
        output_file_path=predictions_file_path,
        source1_name=source1_name,
        source2_name=source2_name,
    )
    matches = prepare_matches_file(
        matches_filepath=matches_file_path,
        source1_name=source1_name,
        source2_name=source2_name,
    )

    # Both helpers name the pair-key column from the first letter of each
    # source; derive it the same way instead of assuming 'f-z'.
    combo_col_name = source1_name[0] + '-' + source2_name[0]

    match_set = set(matches[combo_col_name].values.tolist())
    predictions_set = set(predictions[combo_col_name].values.tolist())

    stats = calculate_f1_stats(
        match_set=match_set,
        prediction_set=predictions_set,
    )

    return stats

In [None]:
# Score the clustered output against the labelled matches.
# NOTE(review): `matches_filepath` is not assigned in any visible cell (the
# notebook defines `matches_file`), so this call raises NameError unless the
# name is defined elsewhere -- confirm which path is meant.
# NOTE(review): the source names here are 'fodors'/'zagats' while earlier
# cells filter on 'yellow_pages'/'yelp' -- confirm they match the data.
get_f1_stats(
    predictions_file_path = output_file,
    matches_file_path = matches_filepath,
    source1_name = 'fodors',
    source2_name = 'zagats'
)