In [None]:
from openclean.data.load import dataset
from openclean.pipeline import stream
import pandas as pd

# Never truncate printed Series/DataFrames: value counts are printed in full.
pd.set_option('display.max_rows', None)

# Directory holding the raw CSV exports listed in data_list.
data_dir = './project_data/'

# File names (relative to data_dir) of the datasets that are loaded.
data_list = [
    '311_Service_Requests_for_2006.csv',
    '311_Service_Requests_for_2007.csv',
    '311_Service_Requests_for_2009.csv',
    '2021_Open_Data_Plan__Future_Releases.csv',
    'Local_Law_8_of_2020___Complaints_of_Illegal_Parking_of_Vehicles_Operated_on_Behalf_of_the_City.csv',
    'SCOUT_CORE.csv',
]

# NOTE (carried over from the original cell): "Park Borough also included in
# igkk & p3af" -- TODO confirm whether this still applies to this notebook.

# One column name per entry of data_list (presumably parallel to it -- verify
# before adding a file whose agency column is named differently).
data_column = ['Agency Name'] * len(data_list)



In [None]:

from openclean.cluster.knn import knn_clusters, knn_collision_clusters
from openclean.function.similarity.base import SimilarityConstraint
from openclean.function.similarity.text import LevenshteinDistance
from openclean.function.value.threshold import GreaterThan

from openclean.function.value.null import is_empty
from openclean.operator.transform.update import update


def calc_effectiveness(problem_rows, cleaned_rows, intersected_num=None):
    """Print and return precision and recall of a cleaning step.

    Parameters
    ----------
    problem_rows : sized collection of row labels, or a number
        The rows that actually had a problem (or how many there were).
    cleaned_rows : sized collection of row labels, or a number
        The rows that the cleaning step touched (or how many there were).
    intersected_num : number, optional
        Number of rows that are both problematic and cleaned. When omitted it
        is derived from the two collections. It is required when plain counts
        are passed, because an overlap cannot be derived from two numbers.

    Returns
    -------
    tuple
        ``(precision, recall)``; each is ``0.0`` when its denominator is zero.

    Raises
    ------
    ValueError
        If the overlap is neither given nor derivable from the arguments.
    """
    problem_is_sized = hasattr(problem_rows, '__len__')
    cleaned_is_sized = hasattr(cleaned_rows, '__len__')

    if intersected_num is None:
        if problem_is_sized and cleaned_is_sized:
            intersected_num = len(set(problem_rows) & set(cleaned_rows))
        else:
            # Legacy behaviour: the original body read a module-level
            # ``intersected_num``; honour it when a caller still defines one.
            intersected_num = globals().get('intersected_num')
            if intersected_num is None:
                raise ValueError(
                    'intersected_num is required when problem_rows/cleaned_rows are counts'
                )

    problem_count = len(problem_rows) if problem_is_sized else problem_rows
    cleaned_count = len(cleaned_rows) if cleaned_is_sized else cleaned_rows

    precision = intersected_num / cleaned_count if cleaned_count != 0 else 0.0
    recall = intersected_num / problem_count if problem_count != 0 else 0.0
    print(f"Data cleaned with precision {precision} and recall {recall} in {intersected_num} cleaning rows")
    return precision, recall

def print_cluster(cnumber, cluster):
    """Print one cluster: a header, each value with its count, and the suggestion.

    ``cnumber`` is the number shown in the header. ``cluster`` must support
    ``len()``, ``items()`` (yielding ``(value, count)`` pairs) and
    ``suggestion()``.
    """
    header = 'Cluster {} (of size {})\n'.format(cnumber, len(cluster))
    print(header)
    for entry in cluster.items():
        print('{} ({})'.format(*entry))
    footer = '\nSuggested value: {}\n\n'.format(cluster.suggestion())
    print(footer)

def perform_knn_cluster(ds_full, column, using_collision=True, minsize=2, t=0.6):
    """Cluster the distinct values of ``column`` with kNN and print every cluster.

    Parameters
    ----------
    ds_full : object exposing ``select(column).distinct()``
        Source of the values to cluster.
    column : str
        Name of the column whose distinct values are clustered.
    using_collision : bool
        ``True`` runs ``knn_collision_clusters``; ``False`` runs ``knn_clusters``.
    minsize : int
        Minimum cluster size passed to the clustering function.
    t : float
        Threshold handed to ``GreaterThan`` for the ``LevenshteinDistance``
        similarity constraint.

    Nothing is returned; the clusters are printed largest first.
    """
    values = ds_full.select(column).distinct()
    sim = SimilarityConstraint(func=LevenshteinDistance(), pred=GreaterThan(t))
    # Fix: the original ternary was inverted, so using_collision=True ran the
    # plain kNN clustering and False ran the collision variant.
    cluster_func = knn_collision_clusters if using_collision else knn_clusters
    clusters = cluster_func(values=values, sim=sim, minsize=minsize)
    print('{} clusters of size {} or greater'.format(len(clusters), minsize))
    clusters.sort(key=len, reverse=True)
    for i, cluster in enumerate(clusters, start=1):
        print_cluster(i, cluster)

def profiling_data(datafile, column):
    """Load ``datafile`` and print how often each value of ``column`` occurs.

    The file is opened with ``stream`` (UTF-8) and materialised via ``to_df()``.
    The value counts and the number of distinct values are printed, and the
    full data frame is returned.
    """
    frame = stream(datafile, encoding='utf-8').to_df()
    counts = frame[column].value_counts()
    print(counts)
    print("Total locations: ", len(counts))
    return frame
    
def cleaning_data_original(df, column):
    """Upper-case ``column``, replace empty cells with ``'OTHER'``, and report changed rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; a new frame is returned by ``update``.
    column : str
        Name of the column to clean.

    Returns
    -------
    tuple
        ``(cleaned_df, cleaned_rows)`` where ``cleaned_rows`` is the index of
        the rows whose value was changed by either step.
    """
    values = df[column]

    # Rows whose text actually changes when upper-cased. Non-string cells are
    # never changed by the upper-casing step, so they are not counted here.
    needs_upper = values.map(lambda v: isinstance(v, str) and v.upper() != v).astype(bool)
    rows_affected_upper = df[needs_upper].index

    # Fix: the original used ``df.isnull().index``, which is the index of the
    # whole frame, so every row was reported as cleaned. Select only the rows
    # that ``is_empty`` flags (upper-casing does not change emptiness).
    empty_mask = values.map(is_empty).astype(bool)
    rows_affected_empty = df[empty_mask].index

    # Guarded upper-casing: plain ``str.upper`` raises on non-string cells.
    df = update(df, columns=column, func=lambda x: x.upper() if isinstance(x, str) else x)
    df = update(df, columns=column, func=lambda x: 'OTHER' if is_empty(x) else x)

    cleaned_rows = rows_affected_upper.union(rows_affected_empty)
    return df, cleaned_rows
    
def save_cleaned_data(df, output='result.csv'):
    """Write ``df`` as CSV to the path given by ``output``.

    Parameters
    ----------
    df : object with a ``to_csv(path)`` method (e.g. a pandas DataFrame)
        Data to write.
    output : str
        Destination path; defaults to ``'result.csv'``.
    """
    # Fix: the original passed the string literal 'output', so the parameter
    # was ignored and the data always went to a file named "output".
    df.to_csv(output)

In [None]:
# Load every dataset, keep only its agency column, and stack the results into
# one frame that becomes the reference data.
#
# The frames are collected in a list and concatenated once: DataFrame.append
# inside a loop copies the accumulated frame on every iteration and was removed
# in pandas 2.0.
agency_frames = []

for name, column in zip(data_list, data_column):
    datafile = data_dir + name
    print("Load data: ", datafile)
    ds = dataset(datafile, encoding='utf-8')

    # Use the column configured for this file and expose it under one name.
    agency_frames.append(ds[[column]].rename(columns={column: 'Agency Name'}))

# Row labels of the individual files are kept, as DataFrame.append kept them.
Agency_df = pd.concat(agency_frames) if agency_frames else pd.DataFrame(columns=['Agency Name'])

print(Agency_df.value_counts())

In [None]:
# Write the combined agency names to disk as our reference data.
# NOTE(review): index=None is presumably treated like index=False (no index
# column written) -- TODO confirm; index=False is the documented spelling.
# The '../reference_data/' directory must already exist.
Agency_df.to_csv('../reference_data/Agency_rows_all.csv', index=None)

In [None]:
# Re-open the reference CSV written above as an openclean stream (UTF-8);
# ds_full feeds the clustering and cleaning cells below.
ds_full = stream('../reference_data/Agency_rows_all.csv', encoding='utf-8')

In [None]:
# Run kNN clustering on the distinct agency names with the default settings
# (using_collision=True, minsize=2, t=0.6) and print every cluster found.
perform_knn_cluster(ds_full, 'Agency Name')

In [None]:
# Materialise the streamed reference data as a data frame for cleaning.
df_overall = ds_full.to_df()

In [None]:
# Upper-case every agency name, then replace empty names with 'UNSPECIFIED'.
df_clean = update(
    update(df_overall, columns='Agency Name', func=str.upper),
    columns='Agency Name',
    func=lambda x: 'UNSPECIFIED' if is_empty(x) else x,
)

# Frequency of each cleaned agency name.
Agency = df_clean['Agency Name'].value_counts()
print(Agency)

In [None]:
# There are far too many distinct agency names; we tried to visualize the data, but the result is not ideal.
import matplotlib.pyplot as plt
import math

# Set the default matplotlib font size to 20 for every figure drawn afterwards.
plt.rc('font', size=20)


    
def draw_hist(numbers, groups, datapath = ''):
    """Draw a horizontal bar chart of log-scaled counts, one bar per group.

    Parameters
    ----------
    numbers : iterable of positive numbers
        Count per group; the natural logarithm of each is plotted, so every
        value must be greater than zero.
    groups : iterable
        Label of each bar, in the same order as ``numbers``.
    datapath : str
        Directory in which to save the figure as ``Agency Name_box.jpg``.
        Nothing is saved when it is the empty string.
    """
    counts = list(numbers)
    labels = list(groups)

    # Scale the figure height with the number of bars (capped) so the tick
    # labels do not pile on top of each other.
    fig_height = min(max(4.0, 0.5 * len(counts)), 200.0)
    fig, ax = plt.subplots(figsize=(16, fig_height))

    # Fix: bars sit one unit apart, so the original height=97.7 made every bar
    # cover its neighbours and alpha=0.1 left them nearly invisible.
    ax.barh(range(len(counts)), [math.log(x) for x in counts], height=0.8, color='steelblue', alpha=0.8)
    ax.set_xlabel('counts (log)')
    ax.set_ylabel('Agency Name')
    ax.set_yticks(range(len(labels)))
    ax.set_yticklabels(labels)
    if datapath != '':
        datapath += '/Agency Name_box.jpg'
        fig.savefig(datapath, bbox_inches='tight')
    plt.show()

In [None]:
# Count identical rows of the cleaned frame and turn the result into a table.
# NOTE(review): assumes df_clean has exactly one column ('Agency Name'), so
# that reset_index() yields two columns -- verify. The explicit rename below
# fixes the column names regardless of what reset_index() calls them.
res = df_clean.value_counts().reset_index()
res.columns = ['Agency Name', 'count']
print(res)

In [None]:
# Plot the (log) count of every agency name; no datapath is given, so the
# figure is only shown and not saved.
draw_hist(res['count'], res['Agency Name'])