In [1]:
import pandas as pd
import numpy as np
import json

import pickle
from openai import OpenAI

from tenacity import (
                        retry,
                        stop_after_attempt,
                        wait_random_exponential
)

In [2]:
def jaccard_similarity(set1, set2):
    
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
     
    return intersection / union

In [3]:
def calculate_similarities(sameSTColumns, df_left, df_right):
    similarity = []
    for i in range(sameSTColumns.shape[0]):
        try:
            similarity.append(jaccard_similarity(set(sameSTColumns.iloc[i,4].split(',')), set(sameSTColumns.iloc[i,5].split(','))))
        except:
            similarity.append(jaccard_similarity(set(sameSTColumns.iloc[i,4].split(',')), set(sameSTColumns.iloc[i,5].split(','))))

    return similarity

In [4]:
def generate_possible_joinable_columns(predictions_left, predictions_right, filename_left, filename_right):
    predicted_joinable_columns = []

    for i in range(predictions_left.shape[0]):
        column = predictions_left.iloc[i,0]
        predicted_semantic_type = predictions_left.iloc[i,1]
        for j in range(predictions_right.shape[0]):
            for st in predicted_semantic_type:
                if(st in predictions_right.iloc[j,1]):
                    predicted_joinable_columns.append([filename_left, column,
                                                       filename_right, predictions_right.iloc[j,:].Column, 
                                                       ",".join(predicted_semantic_type), ",".join(predictions_right.iloc[j,:].Predicted)])
                    

    return predicted_joinable_columns

In [5]:
def generate_prompt_predict_domain(table): 
    with open('cta_types_domain_reduced_6domain.json', 'r') as file:
        cta_type_domain = json.load(file)

    system_msg = f"""
            Answer the question based on the task and instructions below. If the question cannot be answered using the information provided answer with "Place".
            Task: Classify the table given to you with only one of the following domains that are separated with comma: {", ".join(cta_type_domain.keys())}.
            Instructions: 1. Look at the input given to you. 2. Look at the cell values in detail. 3. Decide if describes a {", ".join(cta_type_domain.keys())}. 4. Answer only with the predicted domain. 
            Example 1: Table: [["Friends Pizza", 2525, Cash Visa MasterCard, 7:30 AM]]
            Domain: Restaurant
            Example 2: Table: [[Museum/Gallery, Vancouver; BC; V6J 2C7, Kitsilano]]
            Domain: Place"""
    
    user_msg = f"""Table: {table.iloc[:30,:].values}
                   Domain: """.strip()
    
    return system_msg, user_msg

In [6]:
def generate_prompt_predict_cta(data_point, domain):
    with open('cta_types_domain_reduced_6domain.json', 'r') as file:
        cta_type_domain = json.load(file)

    system_msg = f"""
            Answer the question based on the task, instructions and examples below. If the question cannot be answered using the information provided answer with "I don't know".
            Task: Classify the text given to you with two of the following classes that are separated with comma: {", ".join(cta_type_domain[domain])}.
            Instructions: 1. Look at the input given to you. 2. Look at the cell values in detail.
            Example 1: Column: [Kitsilano, Strathcona, Downtown, UBC, Downtown, Mount Pleasant]
            Label: addressLocality, streetAddress
            Example 2: Column: ['www.memorybc.ca/museum-of-15th-field-artillery-regiment','www.221a.ca/', 'https://www.facebook.com/ACMEstudiobuilding','http://gallery.ahva.ubc.ca/about/','http://www.mozaicoflamenco.com/', 'http://www.anzaclub.org','www.artbeatus.com', 'http://www.artsfactorysociety.ca/']
            Label: URL, Text"""
    
    user_msg = f"""Column: {data_point.values}
                   Label: """.strip()
    
    return system_msg, user_msg

In [7]:
@retry(wait=wait_random_exponential(min=1,max=60), stop=stop_after_attempt(6))
def execute_prompt(client, system_msg, user_msg):
    completion = client.chat.completions.create(
                                            model="gpt-4o",
                                            messages=[
                                                        {
                                                            "role": "system", 
                                                             "content": f"{system_msg}"
                                                        },
                                                        {
                                                            "role": "user",
                                                            "content": f"{user_msg}"
                                                        }
                                                    ]
                                            )
    return completion

In [8]:
def processing_output(prediction, domain):
    with open('cta_types_domain_reduced_6domain.json', 'r') as file:
        cta_type_domain = json.load(file)
    categories = cta_type_domain[domain]

    y_pred = []
    for category in categories:
        if  category.lower() in prediction.lower():
            y_pred.append(category)
            
    if(len(y_pred) == 0):
        y_pred.append("none")

    return y_pred

In [9]:
def generate_predictions(dataframe, client):
    system_msg_predict_domain, user_msg_predict_domain = generate_prompt_predict_domain(dataframe)
    result = execute_prompt(client, system_msg_predict_domain, user_msg_predict_domain)
    domain = result.choices[0].message.content.split('Domain: ')[-1].strip()
    
    X_test = pd.DataFrame(dataframe.iloc[:50].apply(generate_prompt_predict_cta, args=(domain,), axis=0))
    
    y_pred = []
    for i in range(X_test.shape[1]):
        result = execute_prompt(client, X_test.T.iloc[i,0], X_test.T.iloc[i,1])
        prediction = result.choices[0].message.content.split('Label: ')[-1].strip()
        processed_prediction = processing_output(prediction, domain)
        y_pred.append(processed_prediction) 
    
    predictions = pd.DataFrame({
                                "Column":dataframe.columns, 
                                "Predicted":y_pred
                                })
    
    return X_test, y_pred, domain, predictions

In [10]:
client = OpenAI()

In [11]:
df_joinable_columns = pd.read_csv('joinable_columns_90containment.csv')
all_joinable_files = np.concatenate((df_joinable_columns['ds_name'], df_joinable_columns['ds_name_2']), axis=0)
all_joinable_files = np.unique(all_joinable_files)
all_joinable_files

array(['community-centres.csv', 'community-gardens-and-food-trees.csv',
       'cultural-spaces.csv', 'eo4.csv', 'eo_pr.csv', 'eo_xx.csv',
       'libraries.csv', 'population-census-of-botswana-2011.csv',
       'public-art-artists.csv', 'public-art.csv',
       'rental-standards-current-issues.csv', 'schools.csv',
       'statewise-census-data-in-india-1901-2011.csv',
       'street-intersections.csv'], dtype=object)

In [12]:
df_dsInformation = pd.read_csv('datasetInformation_testbedXS.csv') # informações para carregamento de tabelas do TestBedXS

In [13]:
for i in range(len(all_joinable_files)):
    print(f'Making predictions for table {all_joinable_files[i]}. {i+1} out of {len(all_joinable_files)}')
    info  = df_dsInformation[df_dsInformation['filename'] == all_joinable_files[i]]                # info de carregamento da tabela atual
    df = pd.read_csv(f'datasets/{all_joinable_files[i]}', delimiter=info['delimiter'].values[0])   # lê tabela
    system_msg_predict_domain, user_msg_predict_domain = generate_prompt_predict_domain(df)

Making predictions for table community-centres.csv. 1 out of 14
Making predictions for table community-gardens-and-food-trees.csv. 2 out of 14
Making predictions for table cultural-spaces.csv. 3 out of 14
Making predictions for table eo4.csv. 4 out of 14
Making predictions for table eo_pr.csv. 5 out of 14
Making predictions for table eo_xx.csv. 6 out of 14
Making predictions for table libraries.csv. 7 out of 14
Making predictions for table population-census-of-botswana-2011.csv. 8 out of 14
Making predictions for table public-art-artists.csv. 9 out of 14
Making predictions for table public-art.csv. 10 out of 14
Making predictions for table rental-standards-current-issues.csv. 11 out of 14
Making predictions for table schools.csv. 12 out of 14
Making predictions for table statewise-census-data-in-india-1901-2011.csv. 13 out of 14
Making predictions for table street-intersections.csv. 14 out of 14


In [14]:
system_msg_predict_domain

'\n            Answer the question based on the task and instructions below. If the question cannot be answered using the information provided answer with "Place".\n            Task: Classify the table given to you with only one of the following domains that are separated with comma: Product, Event, Person, LocalBusiness, Restaurant, Place.\n            Instructions: 1. Look at the input given to you. 2. Look at the cell values in detail. 3. Decide if describes a Product, Event, Person, LocalBusiness, Restaurant, Place. 4. Answer only with the predicted domain. \n            Example 1: Table: [["Friends Pizza", 2525, Cash Visa MasterCard, 7:30 AM]]\n            Domain: Restaurant\n            Example 2: Table: [[Museum/Gallery, Vancouver; BC; V6J 2C7, Kitsilano]]\n            Domain: Place'

In [15]:
user_msg_predict_domain

'Table: [[70.0 803.0 \'W 55TH AV AND MACDONALD ST\'\n  \'{"type": "Point", "coordinates": [-123.16830022807757, 49.22135011598763]}\'\n  \'Kerrisdale\']\n [54.0 790.0 \'W 49TH AV AND BALACLAVA ST\'\n  \'{"type": "Point", "coordinates": [-123.17591747979144, 49.22740307286524]}\'\n  \'Kerrisdale\']\n [81.0 790.0 \'W 49TH AV AND YEW ST\'\n  \'{"type": "Point", "coordinates": [-123.15880212518363, 49.22720417487744]}\'\n  \'Kerrisdale\']\n [79.0 794.0 \'VINE ST AND W 51ST AV\'\n  \'{"type": "Point", "coordinates": [-123.16015258021925, 49.22504359024529]}\'\n  \'Kerrisdale\']\n [81.0 794.0 \'W 51ST AV AND YEW ST\'\n  \'{"type": "Point", "coordinates": [-123.15895571008483, 49.22502658719802]}\'\n  \'Kerrisdale\']\n [131.0 616.0 \'GRANVILLE ST AND ROLSTON ST\'\n  \'{"type": "Point", "coordinates": [-123.13049246244279, 49.27435212545923]}\'\n  \'Downtown\']\n [313.0 834.0 \'RIVER DIST. CROSSING AND SAWMILL CRESCENT\'\n  \'{"type": "Point", "coordinates": [-123.03102819886475, 49.2059026932

In [10]:
# ST_predictions_dict = {}

# for i in range(len(all_joinable_files)):
#     print(f'Making predictions for table {all_joinable_files[i]}. {i+1} out of {len(all_joinable_files)}')
#     info  = df_dsInformation[df_dsInformation['filename'] == all_joinable_files[i]]                # info de carregamento da tabela atual
#     df = pd.read_csv(f'datasets/{all_joinable_files[i]}', delimiter=info['delimiter'].values[0])   # lê tabela
#     X_test, y_pred, domain, predictions = generate_predictions(df, client)                         # gera CTA para colunas da tabela
#     ST_predictions_dict[all_joinable_files[i]] = predictions                                       # salva os CTA num dicionário

#     with open('GPT4o_predictions/GPT4o_predictions_dict.pkl', 'wb') as f: # Grava arquivo com previsões ao fim do processamento da tabela
#         pickle.dump(ST_predictions_dict, f)

Making predictions for table community-centres.csv. 1 out of 14
Making predictions for table community-gardens-and-food-trees.csv. 2 out of 14
Making predictions for table cultural-spaces.csv. 3 out of 14
Making predictions for table eo4.csv. 4 out of 14
Making predictions for table eo_pr.csv. 5 out of 14
Making predictions for table eo_xx.csv. 6 out of 14
Making predictions for table libraries.csv. 7 out of 14
Making predictions for table population-census-of-botswana-2011.csv. 8 out of 14
Making predictions for table public-art-artists.csv. 9 out of 14
Making predictions for table public-art.csv. 10 out of 14
Making predictions for table rental-standards-current-issues.csv. 11 out of 14
Making predictions for table schools.csv. 12 out of 14
Making predictions for table statewise-census-data-in-india-1901-2011.csv. 13 out of 14
Making predictions for table street-intersections.csv. 14 out of 14


In [None]:
files = glob.glob('GPT4o/*')
for file in files:
    with open(file, 'rb') as file:
        ST_predictions_dict = ST_predictions_dict | pickle.load(file)

In [11]:
ST_predictions_dict

{'community-centres.csv':            Column                         Predicted
 0            NAME     [addressLocality, Place/name]
 1         ADDRESS             [streetAddress, Text]
 2         URLLINK                       [Text, URL]
 3            Geom                    [CoordinateAT]
 4  Geo Local Area  [addressLocality, streetAddress],
 'community-gardens-and-food-trees.csv':                               Column                             Predicted
 0                              MAPID                                [none]
 1                       YEAR_CREATED                       [Text, Integer]
 2                               NAME                    [Place/name, Text]
 3                      STREET_NUMBER                                [none]
 4                   STREET_DIRECTION                                [none]
 5                        STREET_NAME      [addressLocality, streetAddress]
 6                        STREET_TYPE                 [streetAddress, Text]
 7      

In [15]:
df_joinable_columns = df_joinable_columns[(df_joinable_columns['ds_name'].isin(all_joinable_files)) & (df_joinable_columns['ds_name_2'].isin(all_joinable_files))]

In [16]:
all_predicted_joinable_columns = pd.DataFrame([])
similarity_calculations = 0
brute_force_calculations = 0

for i in range(len(all_joinable_files)-1):
    left_info  = df_dsInformation[df_dsInformation['filename'] == all_joinable_files[i]]
    df_left = pd.read_csv(f'datasets/{all_joinable_files[i]}', delimiter=left_info['delimiter'].values[0])
    
    for j in range(i+1, len(all_joinable_files)):
        print(f'Calculating Similarities for tables {all_joinable_files[i]} and {all_joinable_files[j]}.')
        
        right_info = df_dsInformation[df_dsInformation['filename'] == all_joinable_files[j]]
        df_right = pd.read_csv(f'datasets/{all_joinable_files[j]}', delimiter=right_info['delimiter'].values[0])
    
        predictions_left = ST_predictions_dict[all_joinable_files[i]]
        predictions_right = ST_predictions_dict[all_joinable_files[j]]

        predicted_joinable_columns = generate_possible_joinable_columns(predictions_left, predictions_right, all_joinable_files[i], all_joinable_files[j])

        try:
            sameSTColumns = pd.DataFrame(np.array(predicted_joinable_columns), columns=['FilenameLeft', 'ColumnLeft', 
                                                                                        'FilenameRight','ColumnRight',
                                                                                        'SemanticTypeLeft', 'SemanticTypeRight'])
        except ValueError:
            print('No matches found, skipping to next column.')
            continue
            
        similarity = calculate_similarities(sameSTColumns, df_left, df_right)
        sameSTColumns['JaccardSimilarity'] = similarity
        joinableColumns = sameSTColumns[sameSTColumns['JaccardSimilarity'] == 1]

        similarity_calculations += joinableColumns.shape[0]
        brute_force_calculations += df_left.shape[1] * df_right.shape[1]
        
        if(len(joinableColumns) > 0):
            print(f'Adding {joinableColumns.shape[0]} columns')
        
        if(len(all_predicted_joinable_columns) == 0):
            all_predicted_joinable_columns = joinableColumns
        else:
            all_predicted_joinable_columns = pd.concat((all_predicted_joinable_columns, joinableColumns), axis=0)
            print(f'New size {all_predicted_joinable_columns.shape[0]}')

Calculating Similarities for tables community-centres.csv and community-gardens-and-food-trees.csv.
Adding 13 columns
Calculating Similarities for tables community-centres.csv and cultural-spaces.csv.
Adding 5 columns
New size 18
Calculating Similarities for tables community-centres.csv and eo4.csv.
New size 18
Calculating Similarities for tables community-centres.csv and eo_pr.csv.
Adding 2 columns
New size 20
Calculating Similarities for tables community-centres.csv and eo_xx.csv.
New size 20
Calculating Similarities for tables community-centres.csv and libraries.csv.
Adding 9 columns
New size 29
Calculating Similarities for tables community-centres.csv and population-census-of-botswana-2011.csv.
New size 29
Calculating Similarities for tables community-centres.csv and public-art-artists.csv.
Adding 6 columns
New size 35
Calculating Similarities for tables community-centres.csv and public-art.csv.
Adding 11 columns
New size 46
Calculating Similarities for tables community-centres.csv

In [18]:
all_predicted_joinable_columns.head()

Unnamed: 0,FilenameLeft,ColumnLeft,FilenameRight,ColumnRight,SemanticTypeLeft,SemanticTypeRight,JaccardSimilarity
8,community-centres.csv,ADDRESS,community-gardens-and-food-trees.csv,STREET_TYPE,"streetAddress,Text","streetAddress,Text",1.0
9,community-centres.csv,ADDRESS,community-gardens-and-food-trees.csv,STREET_TYPE,"streetAddress,Text","streetAddress,Text",1.0
27,community-centres.csv,URLLINK,community-gardens-and-food-trees.csv,PUBLIC_E_MAIL,"Text,URL","Text,URL",1.0
28,community-centres.csv,URLLINK,community-gardens-and-food-trees.csv,PUBLIC_E_MAIL,"Text,URL","Text,URL",1.0
29,community-centres.csv,URLLINK,community-gardens-and-food-trees.csv,WEBSITE,"Text,URL","Text,URL",1.0


In [19]:
print(similarity_calculations, brute_force_calculations)

2496 15868


In [20]:
all_predicted_joinable_columns.SemanticTypeLeft.value_counts()

SemanticTypeLeft
Integer                              1388
Place/name,Text                       414
none                                  316
addressLocality,streetAddress         194
Text,URL                               68
CoordinateAT                           36
Text,Integer                           26
URL,Text                               12
Text,LocationFeatureSpecification      10
streetAddress,Text                      6
postalCode,Text                         6
Text                                    6
Text,Photograph                         4
addressLocality,Place/name              2
PostalAddress,postalCode                2
Country,Place/name                      2
PostalAddress,streetAddress             2
Text,AggregateRating                    2
Name: count, dtype: int64

In [21]:
all_predicted_joinable_columns_joins = []
excluding_types = ['Text', 'none', 'Integer', 'Text,Integer']
for i in range(len(all_predicted_joinable_columns)):
    if((all_predicted_joinable_columns.iloc[i,:].SemanticTypeLeft not in excluding_types) and
       (all_predicted_joinable_columns.iloc[i,:].SemanticTypeRight not in excluding_types)):
        all_predicted_joinable_columns_joins.append(';'.join(all_predicted_joinable_columns.iloc[i,:4].values))
all_predicted_joinable_columns_joins = np.array(all_predicted_joinable_columns_joins)
all_predicted_joinable_columns_joins

array(['community-centres.csv;ADDRESS;community-gardens-and-food-trees.csv;STREET_TYPE',
       'community-centres.csv;ADDRESS;community-gardens-and-food-trees.csv;STREET_TYPE',
       'community-centres.csv;URLLINK;community-gardens-and-food-trees.csv;PUBLIC_E_MAIL',
       'community-centres.csv;URLLINK;community-gardens-and-food-trees.csv;PUBLIC_E_MAIL',
       'community-centres.csv;URLLINK;community-gardens-and-food-trees.csv;WEBSITE',
       'community-centres.csv;URLLINK;community-gardens-and-food-trees.csv;WEBSITE',
       'community-centres.csv;Geom;community-gardens-and-food-trees.csv;Geom',
       'community-centres.csv;Geo Local Area;community-gardens-and-food-trees.csv;STREET_NAME',
       'community-centres.csv;Geo Local Area;community-gardens-and-food-trees.csv;STREET_NAME',
       'community-centres.csv;Geo Local Area;community-gardens-and-food-trees.csv;MERGED_ADDRESS',
       'community-centres.csv;Geo Local Area;community-gardens-and-food-trees.csv;MERGED_ADDRESS',
 

In [23]:
all_predicted_joinable_columns_joins.shape

(760,)

In [24]:
all_predicted_joinable_columns_joins = pd.DataFrame([line.split(';') for line in all_predicted_joinable_columns_joins], columns=['ds_name', 'att_name', 'ds_name_2','att_name_2'])
all_predicted_joinable_columns_joins

Unnamed: 0,ds_name,att_name,ds_name_2,att_name_2
0,community-centres.csv,ADDRESS,community-gardens-and-food-trees.csv,STREET_TYPE
1,community-centres.csv,ADDRESS,community-gardens-and-food-trees.csv,STREET_TYPE
2,community-centres.csv,URLLINK,community-gardens-and-food-trees.csv,PUBLIC_E_MAIL
3,community-centres.csv,URLLINK,community-gardens-and-food-trees.csv,PUBLIC_E_MAIL
4,community-centres.csv,URLLINK,community-gardens-and-food-trees.csv,WEBSITE
...,...,...,...,...
755,schools.csv,Geo Local Area,street-intersections.csv,XSTREET
756,schools.csv,Geo Local Area,street-intersections.csv,XSTREET
757,schools.csv,Geo Local Area,street-intersections.csv,Geo Local Area
758,schools.csv,Geo Local Area,street-intersections.csv,Geo Local Area


In [44]:
new_df = pd.concat((all_predicted_joinable_columns_joins.iloc[:,:2],all_predicted_joinable_columns_joins.iloc[:,2:].rename(columns={'ds_name_2':'ds_name','att_name_2':'att_name'})))
unique_columns = np.unique([f'{table_name}#{column_name}' for table_name,column_name in new_df.values])
unique_columns

array(['community-centres.csv#ADDRESS',
       'community-centres.csv#Geo Local Area',
       'community-centres.csv#Geom', 'community-centres.csv#NAME',
       'community-centres.csv#URLLINK',
       'community-gardens-and-food-trees.csv#FOOD_TREE_VARIETIES',
       'community-gardens-and-food-trees.csv#Geo Local Area',
       'community-gardens-and-food-trees.csv#Geom',
       'community-gardens-and-food-trees.csv#JURISDICTION',
       'community-gardens-and-food-trees.csv#MERGED_ADDRESS',
       'community-gardens-and-food-trees.csv#NAME',
       'community-gardens-and-food-trees.csv#PUBLIC_E_MAIL',
       'community-gardens-and-food-trees.csv#STEWARD_OR_MANAGING_ORGANIZATION',
       'community-gardens-and-food-trees.csv#STREET_NAME',
       'community-gardens-and-food-trees.csv#STREET_TYPE',
       'community-gardens-and-food-trees.csv#WEBSITE',
       'cultural-spaces.csv#ADDRESS',
       'cultural-spaces.csv#CULTURAL_SPACE_NAME',
       'cultural-spaces.csv#Geom', 'cultural-spac

In [45]:
# np.save('unique_joinable_columns_GPT4o.npy',unique_columns)

In [27]:
# all_predicted_joinable_columns_joins.to_csv('predicted_joinable_columns_gpt4o.csv', index=False)

In [28]:
len(unique_columns)

79

In [29]:
len(unique_columns)

79

In [30]:
n_columns = 0
for key in ST_predictions_dict.keys():
    n_columns += ST_predictions_dict[key].Column.shape[0]
n_columns

188

In [33]:
df_joinable_columns_joins = []
for i in range(len(df_joinable_columns)):
    df_joinable_columns_joins.append(';'.join(df_joinable_columns.iloc[i,:4].values))
    df_joinable_columns_joins.append(';'.join(df_joinable_columns.iloc[i,2:].values)+';'+';'.join(df_joinable_columns.iloc[i,:2].values))
df_joinable_columns_joins = np.array(df_joinable_columns_joins)
df_joinable_columns_joins = np.unique(df_joinable_columns_joins)
df_joinable_columns_joins

array(['community-centres.csv;Geo Local Area;community-gardens-and-food-trees.csv;Geo Local Area',
       'community-centres.csv;Geo Local Area;cultural-spaces.csv;LOCAL_AREA',
       'community-centres.csv;Geo Local Area;libraries.csv;Geo Local Area',
       'community-centres.csv;Geo Local Area;public-art.csv;GeoLocalArea',
       'community-centres.csv;Geo Local Area;public-art.csv;Neighbourhood',
       'community-centres.csv;Geo Local Area;rental-standards-current-issues.csv;Geo Local Area',
       'community-centres.csv;Geo Local Area;schools.csv;Geo Local Area',
       'community-centres.csv;Geo Local Area;street-intersections.csv;Geo Local Area',
       'community-gardens-and-food-trees.csv;FOOD_TREE_VARIETIES;cultural-spaces.csv;ACTIVE_SPACE',
       'community-gardens-and-food-trees.csv;Geo Local Area;community-centres.csv;Geo Local Area',
       'community-gardens-and-food-trees.csv;Geo Local Area;cultural-spaces.csv;LOCAL_AREA',
       'community-gardens-and-food-trees.csv;

In [34]:
all_predicted_joinable_columns_joins.shape

(760, 4)

In [35]:
df_joinable_columns_joins.shape

(110,)

In [36]:
tp = 0
fp = 0
fn = 0

for i in range(len(all_predicted_joinable_columns_joins)):
    if(";".join(all_predicted_joinable_columns_joins.iloc[i,:]) in df_joinable_columns_joins):
        tp += 1
    else:
        fp += 1
    fn = len(df_joinable_columns_joins) - tp

precision = (tp)/(tp+fp)
recall = (tp)/(tp+fn)
f1_score = 2 * (precision*recall)/(precision+recall)

print(precision, recall, f1_score)

0.11315789473684211 0.7818181818181819 0.19770114942528738


# Generating Descriptions from Filtered Columns

In [37]:
def generate_prompt_joins_from_description(table1, table2):

    system_msg = f"""
            Based on the semantic relationship and data type compatibility of the columns in Table 1 and Table 2, identify the best column pair that 
            would enable a meaningful join between these tables, ensuring that the columns being compared represent related entities in the database 
            schema. 
            """
    
    user_msg = f"""Table 1 columns' descriptions: {table1}
                   Table 2 columns' descriptions: {table2}
                   Potential keys: """.strip()
    
    return system_msg, user_msg

In [38]:
def generate_prompt_column_description(table, column):

    system_msg = f"""
            Describe the semantics of a target column.
            Task: Describe the informations within a column in a given table.
            Instructions: 1. Look at the input given to you. 2. Look at the column values in detail. 3. Describe the target column. 
            """
    
    user_msg = f"""Table columns: {table.columns}
                   Table values:  {table.iloc[:30,:].values}
                   Target column: {column}
                   Description: """.strip()
    
    return system_msg, user_msg

In [39]:
def generate_prompt_description_similarity(c1, c2):

    system_msg = f"""
            Given the semantic description of two columns in a dataset, determine whether the columns are joinable based on their semantics 
            and provide a likelihood score between 0 (not joinable) and 10 (highly joinable). Asnwer only with the likelihood score.
            Example 1:
                Semantic description of column 1: the column represents an ID.
                Semantic description of column 2: the column represents an ID.
                Likelihood Score: 10
            """
    
    user_msg = f"""Semantic description of column 1: {c1}
                   Semantic description of column 2: {c2}
                   Likelihood Score: """.strip()
    
    return system_msg, user_msg

In [40]:
def generate_descriptions(dataframe, column, client):
    system_msg_predict_description, user_msg_predict_description = generate_prompt_column_description(dataframe, column)
    result = execute_prompt(client, system_msg_predict_description, user_msg_predict_description)
    description = result.choices[0].message.content.split('Description: ')[-1].strip()
    
    return description

In [41]:
def generate_likelihood(c1, c2, client):
    system_msg_predict_likelihood, user_msg_predict_likelihood = generate_prompt_description_similarity(c1, c2)
    result = execute_prompt(client, system_msg_predict_likelihood, user_msg_predict_likelihood)
    likelihood = result.choices[0].message.content.split('Likelihood Score: ')[-1].strip()

    return likelihood

In [42]:
unique_columns

array(['community-centres.csv_ADDRESS',
       'community-centres.csv_Geo Local Area',
       'community-centres.csv_Geom', 'community-centres.csv_NAME',
       'community-centres.csv_URLLINK',
       'community-gardens-and-food-trees.csv_FOOD_TREE_VARIETIES',
       'community-gardens-and-food-trees.csv_Geo Local Area',
       'community-gardens-and-food-trees.csv_Geom',
       'community-gardens-and-food-trees.csv_JURISDICTION',
       'community-gardens-and-food-trees.csv_MERGED_ADDRESS',
       'community-gardens-and-food-trees.csv_NAME',
       'community-gardens-and-food-trees.csv_PUBLIC_E_MAIL',
       'community-gardens-and-food-trees.csv_STEWARD_OR_MANAGING_ORGANIZATION',
       'community-gardens-and-food-trees.csv_STREET_NAME',
       'community-gardens-and-food-trees.csv_STREET_TYPE',
       'community-gardens-and-food-trees.csv_WEBSITE',
       'cultural-spaces.csv_ADDRESS',
       'cultural-spaces.csv_CULTURAL_SPACE_NAME',
       'cultural-spaces.csv_Geom', 'cultural-spac

In [47]:
unique_joinable_columns = pd.DataFrame([unique_info.split('#') for unique_info in np.load('unique_joinable_columns_GPT4o.npy')],
                                      columns = ['TABLE','COLUMN'])
unique_joinable_columns.head()

Unnamed: 0,TABLE,COLUMN
0,community-centres.csv,ADDRESS
1,community-centres.csv,Geo Local Area
2,community-centres.csv,Geom
3,community-centres.csv,NAME
4,community-centres.csv,URLLINK


In [48]:
descriptions = []
for i in range(unique_joinable_columns.shape[0]):
    info  = df_dsInformation[df_dsInformation['filename'] == unique_joinable_columns.iloc[i,0]]
    df = pd.read_csv(f'datasets/{unique_joinable_columns.iloc[i,0]}', delimiter=info['delimiter'].values[0])
    description = generate_descriptions(df, unique_joinable_columns.iloc[i,1], client)
    descriptions.append(description)

In [49]:
unique_joinable_columns.insert(2, "DESCRIPTION", descriptions, True)
unique_joinable_columns.head()

Unnamed: 0,TABLE,COLUMN,DESCRIPTION
0,community-centres.csv,ADDRESS,"The ""ADDRESS"" column represents the physical s..."
1,community-centres.csv,Geo Local Area,"The ""Geo Local Area"" column in the table conta..."
2,community-centres.csv,Geom,"The ""Geom"" column in the provided table repres..."
3,community-centres.csv,NAME,"The target column ""NAME"" contains the names of..."
4,community-centres.csv,URLLINK,"The ""URLLINK"" column contains web addresses, s..."


In [50]:
unique_joinable_columns.to_csv('described_predicted_joinable_columns_GPT4o.csv', index=False)

In [51]:
for i in range(unique_joinable_columns.shape[0]):
    print(unique_joinable_columns.TABLE[i], unique_joinable_columns.COLUMN[i])
    print(unique_joinable_columns.DESCRIPTION[i])
    print('------------------------')

community-centres.csv ADDRESS
The "ADDRESS" column represents the physical street addresses of various community centers or parks listed in the dataset. Each value in this column provides specific locational information, typically including the street number, street name, and sometimes other necessary details such as "Street", "Avenue", or "Drive". These addresses are located in Vancouver, and are part of the data entries that also include the name of the community center or park, the URL of their webpage, geographic coordinates, and the associated local area name.

This column is crucial as it provides tangible location details that could be used for navigation, mail services, and identification of the sites in physical space. It supports tasks that require geographic mapping or logistical planning for those looking to visit or deliver services to these sites.
------------------------
community-centres.csv Geo Local Area
The "Geo Local Area" column in the table contains the names of g

In [52]:
unique_joinable_columns.insert(3, "KEY", unique_joinable_columns.TABLE + "#" + unique_joinable_columns.COLUMN, True)
unique_joinable_columns.head()

Unnamed: 0,TABLE,COLUMN,DESCRIPTION,KEY
0,community-centres.csv,ADDRESS,"The ""ADDRESS"" column represents the physical s...",community-centres.csv#ADDRESS
1,community-centres.csv,Geo Local Area,"The ""Geo Local Area"" column in the table conta...",community-centres.csv#Geo Local Area
2,community-centres.csv,Geom,"The ""Geom"" column in the provided table repres...",community-centres.csv#Geom
3,community-centres.csv,NAME,"The target column ""NAME"" contains the names of...",community-centres.csv#NAME
4,community-centres.csv,URLLINK,"The ""URLLINK"" column contains web addresses, s...",community-centres.csv#URLLINK


In [58]:
all_predicted_joinable_columns_joins.head()

Unnamed: 0,ds_name,att_name,ds_name_2,att_name_2
0,community-centres.csv,ADDRESS,community-gardens-and-food-trees.csv,STREET_TYPE
1,community-centres.csv,ADDRESS,community-gardens-and-food-trees.csv,STREET_TYPE
2,community-centres.csv,URLLINK,community-gardens-and-food-trees.csv,PUBLIC_E_MAIL
3,community-centres.csv,URLLINK,community-gardens-and-food-trees.csv,PUBLIC_E_MAIL
4,community-centres.csv,URLLINK,community-gardens-and-food-trees.csv,WEBSITE


In [59]:
all_predicted_joinable_columns_joins.insert(4, "LEFT_KEY", all_predicted_joinable_columns_joins.ds_name + "#" + all_predicted_joinable_columns_joins.att_name, True)
all_predicted_joinable_columns_joins.insert(5, "RIGHT_KEY", all_predicted_joinable_columns_joins.ds_name_2 + "#" + all_predicted_joinable_columns_joins.att_name_2, True)
all_predicted_joinable_columns_joins.head()

Unnamed: 0,ds_name,att_name,ds_name_2,att_name_2,LEFT_KEY,RIGHT_KEY
0,community-centres.csv,ADDRESS,community-gardens-and-food-trees.csv,STREET_TYPE,community-centres.csv#ADDRESS,community-gardens-and-food-trees.csv#STREET_TYPE
1,community-centres.csv,ADDRESS,community-gardens-and-food-trees.csv,STREET_TYPE,community-centres.csv#ADDRESS,community-gardens-and-food-trees.csv#STREET_TYPE
2,community-centres.csv,URLLINK,community-gardens-and-food-trees.csv,PUBLIC_E_MAIL,community-centres.csv#URLLINK,community-gardens-and-food-trees.csv#PUBLIC_E_...
3,community-centres.csv,URLLINK,community-gardens-and-food-trees.csv,PUBLIC_E_MAIL,community-centres.csv#URLLINK,community-gardens-and-food-trees.csv#PUBLIC_E_...
4,community-centres.csv,URLLINK,community-gardens-and-food-trees.csv,WEBSITE,community-centres.csv#URLLINK,community-gardens-and-food-trees.csv#WEBSITE


In [60]:
df_left_description = pd.merge(all_predicted_joinable_columns_joins, unique_joinable_columns, left_on='LEFT_KEY', right_on='KEY')
df_left_description = df_left_description.drop(['TABLE', 'COLUMN', 'KEY'], axis=1)
df_left_description.columns = ['ds_name', 'att_name', 'ds_name_2', 'att_name_2', 'LEFT_KEY', 'RIGHT_KEY', 'LEFT_DESCRIPTION']

df_description = pd.merge(df_left_description, unique_joinable_columns, left_on='RIGHT_KEY', right_on='KEY')
df_description = df_description.drop(['TABLE', 'COLUMN', 'KEY'], axis=1)
df_description.columns = ['ds_name', 'att_name', 'ds_name_2', 'att_name_2', 'LEFT_KEY', 'RIGHT_KEY', 'LEFT_DESCRIPTION', 'RIGHT_DESCRIPTION']

df_description.head()

Unnamed: 0,ds_name,att_name,ds_name_2,att_name_2,LEFT_KEY,RIGHT_KEY,LEFT_DESCRIPTION,RIGHT_DESCRIPTION
0,community-centres.csv,ADDRESS,community-gardens-and-food-trees.csv,STREET_TYPE,community-centres.csv#ADDRESS,community-gardens-and-food-trees.csv#STREET_TYPE,"The ""ADDRESS"" column represents the physical s...","The ""STREET_TYPE"" column in this table appears..."
1,community-centres.csv,ADDRESS,community-gardens-and-food-trees.csv,STREET_TYPE,community-centres.csv#ADDRESS,community-gardens-and-food-trees.csv#STREET_TYPE,"The ""ADDRESS"" column represents the physical s...","The ""STREET_TYPE"" column in this table appears..."
2,community-centres.csv,URLLINK,community-gardens-and-food-trees.csv,PUBLIC_E_MAIL,community-centres.csv#URLLINK,community-gardens-and-food-trees.csv#PUBLIC_E_...,"The ""URLLINK"" column contains web addresses, s...",The `PUBLIC_E_MAIL` column in the table is des...
3,community-centres.csv,URLLINK,community-gardens-and-food-trees.csv,PUBLIC_E_MAIL,community-centres.csv#URLLINK,community-gardens-and-food-trees.csv#PUBLIC_E_...,"The ""URLLINK"" column contains web addresses, s...",The `PUBLIC_E_MAIL` column in the table is des...
4,community-centres.csv,URLLINK,community-gardens-and-food-trees.csv,WEBSITE,community-centres.csv#URLLINK,community-gardens-and-food-trees.csv#WEBSITE,"The ""URLLINK"" column contains web addresses, s...","The ""WEBSITE"" column in the table contains URL..."


In [61]:
likelihoods = []
for i in range(df_description.shape[0]):
    print(f'Description {i+1} out of {df_description.shape[0]}.', end='\r')
    left_description = df_description.iloc[i, 6]
    right_description = df_description.iloc[i, 7]

    likelihood = generate_likelihood(left_description, right_description, client)

    likelihoods.append(likelihood)

Description 760 out of 760.

In [62]:
df_description.insert(8, "LIKELIHOOD", likelihoods, True)
df_description.head()

Unnamed: 0,ds_name,att_name,ds_name_2,att_name_2,LEFT_KEY,RIGHT_KEY,LEFT_DESCRIPTION,RIGHT_DESCRIPTION,LIKELIHOOD
0,community-centres.csv,ADDRESS,community-gardens-and-food-trees.csv,STREET_TYPE,community-centres.csv#ADDRESS,community-gardens-and-food-trees.csv#STREET_TYPE,"The ""ADDRESS"" column represents the physical s...","The ""STREET_TYPE"" column in this table appears...",6
1,community-centres.csv,ADDRESS,community-gardens-and-food-trees.csv,STREET_TYPE,community-centres.csv#ADDRESS,community-gardens-and-food-trees.csv#STREET_TYPE,"The ""ADDRESS"" column represents the physical s...","The ""STREET_TYPE"" column in this table appears...",3
2,community-centres.csv,URLLINK,community-gardens-and-food-trees.csv,PUBLIC_E_MAIL,community-centres.csv#URLLINK,community-gardens-and-food-trees.csv#PUBLIC_E_...,"The ""URLLINK"" column contains web addresses, s...",The `PUBLIC_E_MAIL` column in the table is des...,0
3,community-centres.csv,URLLINK,community-gardens-and-food-trees.csv,PUBLIC_E_MAIL,community-centres.csv#URLLINK,community-gardens-and-food-trees.csv#PUBLIC_E_...,"The ""URLLINK"" column contains web addresses, s...",The `PUBLIC_E_MAIL` column in the table is des...,0
4,community-centres.csv,URLLINK,community-gardens-and-food-trees.csv,WEBSITE,community-centres.csv#URLLINK,community-gardens-and-food-trees.csv#WEBSITE,"The ""URLLINK"" column contains web addresses, s...","The ""WEBSITE"" column in the table contains URL...",1


In [63]:
for i in range(df_description.shape[0]):
    print(",".join(df_description.iloc[i,:4].values), df_description.iloc[i,8])

community-centres.csv,ADDRESS,community-gardens-and-food-trees.csv,STREET_TYPE 6
community-centres.csv,ADDRESS,community-gardens-and-food-trees.csv,STREET_TYPE 3
community-centres.csv,URLLINK,community-gardens-and-food-trees.csv,PUBLIC_E_MAIL 0
community-centres.csv,URLLINK,community-gardens-and-food-trees.csv,PUBLIC_E_MAIL 0
community-centres.csv,URLLINK,community-gardens-and-food-trees.csv,WEBSITE 1
community-centres.csv,URLLINK,community-gardens-and-food-trees.csv,WEBSITE 2
community-centres.csv,Geom,community-gardens-and-food-trees.csv,Geom 10
community-centres.csv,Geo Local Area,community-gardens-and-food-trees.csv,STREET_NAME 2
community-centres.csv,Geo Local Area,community-gardens-and-food-trees.csv,STREET_NAME 0
community-centres.csv,Geo Local Area,community-gardens-and-food-trees.csv,MERGED_ADDRESS 0
community-centres.csv,Geo Local Area,community-gardens-and-food-trees.csv,MERGED_ADDRESS 0
community-centres.csv,Geo Local Area,community-gardens-and-food-trees.csv,Geo Local Area

In [64]:
for i in range(df_description.shape[0]):
    if(float(df_description.iloc[i, 8]) == 10):
        print(",".join(df_description.iloc[i,:4].values), df_description.iloc[i,8])

community-centres.csv,Geom,community-gardens-and-food-trees.csv,Geom 10
community-centres.csv,Geo Local Area,community-gardens-and-food-trees.csv,Geo Local Area 10
community-centres.csv,Geo Local Area,community-gardens-and-food-trees.csv,Geo Local Area 10
community-centres.csv,Geom,cultural-spaces.csv,Geom 10
community-centres.csv,Geo Local Area,cultural-spaces.csv,LOCAL_AREA 10
community-centres.csv,Geo Local Area,cultural-spaces.csv,LOCAL_AREA 10
community-centres.csv,Geom,libraries.csv,Geom 10
community-centres.csv,Geo Local Area,libraries.csv,Geo Local Area 10
community-centres.csv,Geo Local Area,libraries.csv,Geo Local Area 10
community-centres.csv,Geo Local Area,public-art.csv,Neighbourhood 10
community-centres.csv,Geo Local Area,public-art.csv,Neighbourhood 10
community-centres.csv,Geo Local Area,public-art.csv,GeoLocalArea 10
community-centres.csv,Geo Local Area,public-art.csv,GeoLocalArea 10
community-centres.csv,Geo Local Area,rental-standards-current-issues.csv,Geo Local Are

In [65]:
df_description.to_csv('likelihood_described_predicted_joinable_columns_GPT4o.csv',index=False)

In [81]:
joinable_columns_from_df_description = []
for i in range(df_description[df_description.LIKELIHOOD == "10"].shape[0]):
    joinable_columns_from_df_description.append(";".join(df_description[df_description.LIKELIHOOD == "10"].iloc[i, :4].values))
joinable_columns_from_df_description

['community-centres.csv;Geom;community-gardens-and-food-trees.csv;Geom',
 'community-centres.csv;Geo Local Area;community-gardens-and-food-trees.csv;Geo Local Area',
 'community-centres.csv;Geo Local Area;community-gardens-and-food-trees.csv;Geo Local Area',
 'community-centres.csv;Geom;cultural-spaces.csv;Geom',
 'community-centres.csv;Geo Local Area;cultural-spaces.csv;LOCAL_AREA',
 'community-centres.csv;Geo Local Area;cultural-spaces.csv;LOCAL_AREA',
 'community-centres.csv;Geom;libraries.csv;Geom',
 'community-centres.csv;Geo Local Area;libraries.csv;Geo Local Area',
 'community-centres.csv;Geo Local Area;libraries.csv;Geo Local Area',
 'community-centres.csv;Geo Local Area;public-art.csv;Neighbourhood',
 'community-centres.csv;Geo Local Area;public-art.csv;Neighbourhood',
 'community-centres.csv;Geo Local Area;public-art.csv;GeoLocalArea',
 'community-centres.csv;Geo Local Area;public-art.csv;GeoLocalArea',
 'community-centres.csv;Geo Local Area;rental-standards-current-issues.cs

In [84]:
joinable_columns_from_df_description = np.array(joinable_columns_from_df_description)
joinable_columns_from_df_description = np.unique(joinable_columns_from_df_description)
joinable_columns_from_df_description

array(['community-centres.csv;Geo Local Area;community-gardens-and-food-trees.csv;Geo Local Area',
       'community-centres.csv;Geo Local Area;cultural-spaces.csv;LOCAL_AREA',
       'community-centres.csv;Geo Local Area;libraries.csv;Geo Local Area',
       'community-centres.csv;Geo Local Area;public-art.csv;GeoLocalArea',
       'community-centres.csv;Geo Local Area;public-art.csv;Neighbourhood',
       'community-centres.csv;Geo Local Area;rental-standards-current-issues.csv;Geo Local Area',
       'community-centres.csv;Geo Local Area;schools.csv;Geo Local Area',
       'community-centres.csv;Geo Local Area;street-intersections.csv;Geo Local Area',
       'community-centres.csv;Geom;community-gardens-and-food-trees.csv;Geom',
       'community-centres.csv;Geom;cultural-spaces.csv;Geom',
       'community-centres.csv;Geom;libraries.csv;Geom',
       'community-centres.csv;Geom;street-intersections.csv;Geom',
       'community-gardens-and-food-trees.csv;Geo Local Area;cultural-spac

In [78]:
df_joinable_columns_joins

array(['community-centres.csv;Geo Local Area;community-gardens-and-food-trees.csv;Geo Local Area',
       'community-centres.csv;Geo Local Area;cultural-spaces.csv;LOCAL_AREA',
       'community-centres.csv;Geo Local Area;libraries.csv;Geo Local Area',
       'community-centres.csv;Geo Local Area;public-art.csv;GeoLocalArea',
       'community-centres.csv;Geo Local Area;public-art.csv;Neighbourhood',
       'community-centres.csv;Geo Local Area;rental-standards-current-issues.csv;Geo Local Area',
       'community-centres.csv;Geo Local Area;schools.csv;Geo Local Area',
       'community-centres.csv;Geo Local Area;street-intersections.csv;Geo Local Area',
       'community-gardens-and-food-trees.csv;FOOD_TREE_VARIETIES;cultural-spaces.csv;ACTIVE_SPACE',
       'community-gardens-and-food-trees.csv;Geo Local Area;community-centres.csv;Geo Local Area',
       'community-gardens-and-food-trees.csv;Geo Local Area;cultural-spaces.csv;LOCAL_AREA',
       'community-gardens-and-food-trees.csv;

In [90]:
tp = 0
list_tp = []

fp = 0
list_fp = []

fn = 0
list_fn = []

for i in range(len(joinable_columns_from_df_description)):
    if(joinable_columns_from_df_description[i] in df_joinable_columns_joins):
        tp += 1
        list_tp.append(joinable_columns_from_df_description[i])
    else:
        fp += 1
        list_fp.append(joinable_columns_from_df_description[i])
fn = len(df_joinable_columns_joins) - tp

for i in range(len(df_joinable_columns_joins)):
    if(df_joinable_columns_joins[i] not in joinable_columns_from_df_description):
        list_fn.append(df_joinable_columns_joins[i])

precision = (tp)/(tp+fp)
recall = (tp)/(tp+fn)
f1_score = 2 * (precision*recall)/(precision+recall)

print(precision, recall, f1_score)

0.5573770491803278 0.3090909090909091 0.39766081871345027


In [87]:
tp

34

In [91]:
list_tp

['community-centres.csv;Geo Local Area;community-gardens-and-food-trees.csv;Geo Local Area',
 'community-centres.csv;Geo Local Area;cultural-spaces.csv;LOCAL_AREA',
 'community-centres.csv;Geo Local Area;libraries.csv;Geo Local Area',
 'community-centres.csv;Geo Local Area;public-art.csv;GeoLocalArea',
 'community-centres.csv;Geo Local Area;public-art.csv;Neighbourhood',
 'community-centres.csv;Geo Local Area;rental-standards-current-issues.csv;Geo Local Area',
 'community-centres.csv;Geo Local Area;schools.csv;Geo Local Area',
 'community-centres.csv;Geo Local Area;street-intersections.csv;Geo Local Area',
 'community-gardens-and-food-trees.csv;Geo Local Area;cultural-spaces.csv;LOCAL_AREA',
 'community-gardens-and-food-trees.csv;Geo Local Area;libraries.csv;Geo Local Area',
 'community-gardens-and-food-trees.csv;Geo Local Area;public-art.csv;GeoLocalArea',
 'community-gardens-and-food-trees.csv;Geo Local Area;public-art.csv;Neighbourhood',
 'community-gardens-and-food-trees.csv;Geo L

In [88]:
fp

27

In [92]:
list_fp

['community-centres.csv;Geom;community-gardens-and-food-trees.csv;Geom',
 'community-centres.csv;Geom;cultural-spaces.csv;Geom',
 'community-centres.csv;Geom;libraries.csv;Geom',
 'community-centres.csv;Geom;street-intersections.csv;Geom',
 'community-gardens-and-food-trees.csv;Geom;cultural-spaces.csv;Geom',
 'community-gardens-and-food-trees.csv;Geom;libraries.csv;Geom',
 'community-gardens-and-food-trees.csv;Geom;public-art.csv;Geom',
 'community-gardens-and-food-trees.csv;Geom;rental-standards-current-issues.csv;Geom',
 'community-gardens-and-food-trees.csv;Geom;street-intersections.csv;Geom',
 'cultural-spaces.csv;Geom;libraries.csv;Geom',
 'cultural-spaces.csv;Geom;public-art.csv;Geom',
 'cultural-spaces.csv;Geom;rental-standards-current-issues.csv;Geom',
 'cultural-spaces.csv;Geom;schools.csv;Geom',
 'cultural-spaces.csv;Geom;street-intersections.csv;Geom',
 'eo4.csv;ICO;eo_xx.csv;NAME',
 'eo_pr.csv;ICO;eo_xx.csv;ICO',
 'eo_pr.csv;NAME;eo_xx.csv;NAME',
 'eo_pr.csv;SORT_NAME;eo_x

In [89]:
fn

76

In [93]:
list_fn

['community-gardens-and-food-trees.csv;FOOD_TREE_VARIETIES;cultural-spaces.csv;ACTIVE_SPACE',
 'community-gardens-and-food-trees.csv;Geo Local Area;community-centres.csv;Geo Local Area',
 'community-gardens-and-food-trees.csv;Geo Local Area;street-intersections.csv;Geo Local Area',
 'community-gardens-and-food-trees.csv;NUMBER_OF_FOOD_TREES;cultural-spaces.csv;ACTIVE_SPACE',
 'cultural-spaces.csv;ACTIVE_SPACE;community-gardens-and-food-trees.csv;FOOD_TREE_VARIETIES',
 'cultural-spaces.csv;ACTIVE_SPACE;community-gardens-and-food-trees.csv;NUMBER_OF_FOOD_TREES',
 'cultural-spaces.csv;LOCAL_AREA;community-centres.csv;Geo Local Area',
 'cultural-spaces.csv;LOCAL_AREA;community-gardens-and-food-trees.csv;Geo Local Area',
 'cultural-spaces.csv;LOCAL_AREA;street-intersections.csv;Geo Local Area',
 'eo4.csv;CITY;eo_pr.csv;CITY',
 'eo4.csv;CITY;public-art-artists.csv;Country',
 'eo4.csv;NAME;eo_pr.csv;NAME',
 'eo4.csv;NTEE_CD;eo_pr.csv;NTEE_CD',
 'eo4.csv;NTEE_CD;eo_xx.csv;NTEE_CD',
 'eo4.csv;S