In [1]:
import pandas as pd
import numpy as np

from utils import (generate_prompt, 
                   generate_test_prompt, 
                    find_all_linear_names, 
                    predict_old, 
                    evaluate, 
                    load_pretrained_model, 
                    generate_train_val_data,
                    initiate_trainer,
                    initiate_base_model)

In [2]:
def correcting_semantic_type(semantic_type):
    return semantic_type.replace(',',';')

In [3]:
def generate_test_prompt_NextiaJD_few_shot(data_point):
    cta_types = np.load('cta_types.npy', allow_pickle=True)
    return f"""
            Answer the question based on the task, instructions and examples below. If the question cannot be answered using the information provided answer with "I don't know".
            Task: Classify the text given to you with only one of these classes that are separated with comma: {", ".join(cta_types)}.
            Instructions: 1. Look at the input given to you. 2. Look at the cell values in detail.
            Example 1: Column: [Kitsilano, Strathcona, Downtown, UBC, Downtown, Mount Pleasant]
            label: addressLoc
            Example 2: Column: ['www.memorybc.ca/museum-of-15th-field-artillery-regiment','www.221a.ca/', 'https://www.facebook.com/ACMEstudiobuilding','http://gallery.ahva.ubc.ca/about/','http://www.mozaicoflamenco.com/', 'http://www.anzaclub.org','www.artbeatus.com', 'http://www.artsfactorysociety.ca/']
            label: URL
            Column: {data_point[:50].values}
label: """.strip()

In [4]:
def jaccard_similarity(set1, set2):
    
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
     
    return intersection / union

In [14]:
def generate_predictions(dataframe, model, tokenizer):
    X_test = pd.DataFrame(dataframe.iloc[:10].apply(generate_test_prompt_NextiaJD_few_shot, axis=0), columns=["prediction"])
    y_pred = predict_old(X_test, model, tokenizer)
    predictions = pd.DataFrame({
                                "Column":dataframe.columns, 
                                "Predicted":y_pred
                                })
    return predictions

In [6]:
def generate_possible_joinable_columns(predictions_left, predictions_right, filename_left, filename_right):
    predicted_joinable_columns = []

    for i in range(predictions_left.shape[0]):
        column = predictions_left.iloc[i,0]
        predicted_semantic_type = predictions_left.iloc[i,1]
        for j in range(predictions_right.shape[0]):
            if(predicted_semantic_type == predictions_right.iloc[j,1]):
                predicted_joinable_columns.append([filename_left, column,
                                                   filename_right, predictions_right.iloc[j,:].Column, 
                                                   predicted_semantic_type])

    return predicted_joinable_columns

In [7]:
def calculate_similarities(sameSTColumns, df_left, df_right):
    similarity = []
    for i in range(sameSTColumns.shape[0]):
        try:
            similarity.append(jaccard_similarity(set(df_left[sameSTColumns.iloc[i,1]].unique()), set(df_right[sameSTColumns.iloc[i,3]].unique())))
        except:
            similarity.append(jaccard_similarity(set(df_right[sameSTColumns.iloc[i,1]].unique()), set(df_left[sameSTColumns.iloc[i,3]].unique())))

    return similarity

In [8]:
df_joinable_columns = pd.read_csv('joinable_columns_75containment.csv')
all_joinable_files = np.concatenate((df_joinable_columns['ds_name'], df_joinable_columns['ds_name_2']), axis=0)
all_joinable_files = np.unique(all_joinable_files)
all_joinable_files

array(['community-centres.csv', 'community-gardens-and-food-trees.csv',
       'cultural-spaces.csv', 'eo4.csv', 'eo_pr.csv', 'eo_xx.csv',
       'libraries.csv', 'population-census-of-botswana-2011.csv',
       'public-art-artists.csv', 'public-art.csv',
       'rental-standards-current-issues.csv', 'schools.csv',
       'statewise-census-data-in-india-1901-2011.csv',
       'street-intersections.csv'], dtype=object)

In [9]:
df_dsInformation = pd.read_csv('datasetInformation_testbedXS.csv')

In [10]:
base_model_name = "meta-llama/Meta-Llama-3.1-8B"
model, tokenizer = initiate_base_model(base_model_name)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [11]:
modules = find_all_linear_names(model)

In [12]:
semantic_types = np.load('cta_types.npy', allow_pickle=True)
semantic_types = np.unique(np.array(list(map(correcting_semantic_type,semantic_types))))

In [15]:
ST_predictions_dict = {}
for i in range(len(all_joinable_files)):
    print(f'Making predictions for table {all_joinable_files[i]}. {i+1} out of {len(all_joinable_files)}', end='\r')
    info  = df_dsInformation[df_dsInformation['filename'] == all_joinable_files[i]]
    df = pd.read_csv(f'datasets/{all_joinable_files[i]}', delimiter=info['delimiter'].values[0])
    prediction = generate_predictions(df, model, tokenizer)
    ST_predictions_dict[all_joinable_files[i]] = prediction

Making predictions for table community-centres.csv. 1 out of 14

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:02<00:00,  2.34it/s]


Making predictions for table community-gardens-and-food-trees.csv. 2 out of 14

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19/19 [00:05<00:00,  3.26it/s]


Making predictions for table cultural-spaces.csv. 3 out of 14

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:03<00:00,  3.14it/s]


Making predictions for table eo4.csv. 4 out of 14

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [00:08<00:00,  3.28it/s]


Making predictions for table eo_pr.csv. 5 out of 14

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [00:08<00:00,  3.27it/s]


Making predictions for table eo_xx.csv. 6 out of 14

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [00:08<00:00,  3.26it/s]


Making predictions for table libraries.csv. 7 out of 14

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.06it/s]


Making predictions for table population-census-of-botswana-2011.csv. 8 out of 14

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:02<00:00,  3.27it/s]


Making predictions for table public-art-artists.csv. 9 out of 14

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:03<00:00,  2.66it/s]


Making predictions for table public-art.csv. 10 out of 14

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19/19 [00:06<00:00,  3.02it/s]


Making predictions for table rental-standards-current-issues.csv. 11 out of 14

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:02<00:00,  3.01it/s]


Making predictions for table schools.csv. 12 out of 14

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.04it/s]


Making predictions for table statewise-census-data-in-india-1901-2011.csv. 13 out of 14

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:02<00:00,  3.24it/s]


Making predictions for table street-intersections.csv. 14 out of 14

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.02it/s]


In [16]:
ST_predictions_dict

{'community-centres.csv':            Column      Predicted
 0            NAME          Place
 1         ADDRESS  streetAddress
 2         URLLINK            URL
 3            Geom       GeoShape
 4  Geo Local Area     addressLoc,
 'community-gardens-and-food-trees.csv':                               Column       Predicted
 0                              MAPID  identifierName
 1                       YEAR_CREATED      typicalAge
 2                               NAME            Text
 3                      STREET_NUMBER           price
 4                   STREET_DIRECTION      GenderType
 5                        STREET_NAME            Text
 6                        STREET_TYPE   streetAddress
 7                     MERGED_ADDRESS   streetAddress
 8                    NUMBER_OF_PLOTS         Integer
 9               NUMBER_OF_FOOD_TREES         Integer
 10                             NOTES            Text
 11               FOOD_TREE_VARIETIES            Text
 12                 OTHER_FO

In [17]:
all_joinable_columns = pd.DataFrame([])
similarity_calculations = 0
brute_force_calculations = 0

for i in range(len(all_joinable_files)-1):
    left_info  = df_dsInformation[df_dsInformation['filename'] == all_joinable_files[i]]
    df_left = pd.read_csv(f'datasets/{all_joinable_files[i]}', delimiter=left_info['delimiter'].values[0])
    
    for j in range(i+1, len(all_joinable_files)):
        print(f'Calculating Similarities for tables {all_joinable_files[i]} and {all_joinable_files[j]}.')
        
        right_info = df_dsInformation[df_dsInformation['filename'] == all_joinable_files[j]]
        df_right = pd.read_csv(f'datasets/{all_joinable_files[j]}', delimiter=right_info['delimiter'].values[0])
    
        predictions_left = ST_predictions_dict[all_joinable_files[i]]
        predictions_right = ST_predictions_dict[all_joinable_files[j]]
    
        predicted_joinable_columns = generate_possible_joinable_columns(predictions_left, predictions_right, all_joinable_files[i], all_joinable_files[j])
    
        try:
            sameSTColumns = pd.DataFrame(np.array(predicted_joinable_columns), columns=['FilenameLeft', 'ColumnLeft', 
                                                                                        'FilenameRight','ColumnRight',
                                                                                        'SemanticType'])
        except ValueError:
            print('No matches found, skipping to next column.')
            continue
            
        similarity = calculate_similarities(sameSTColumns, df_left, df_right)
        sameSTColumns['JaccardSimilarity'] = similarity
        joinableColumns = sameSTColumns[sameSTColumns['JaccardSimilarity'] >= 0.75]

        similarity_calculations += sameSTColumns.shape[0]
        brute_force_calculations += df_left.shape[1] * df_right.shape[1]
        
        if(len(joinableColumns) > 0):
            print(f'Adding {joinableColumns.shape[0]} columns')
        
        if(len(all_joinable_columns) == 0):
            all_joinable_columns = joinableColumns
        else:
            all_joinable_columns = pd.concat((all_joinable_columns, joinableColumns), axis=0)
            print(f'New size {all_joinable_columns.shape[0]}')

Calculating Similarities for tables community-centres.csv and community-gardens-and-food-trees.csv.
Adding 1 columns
Calculating Similarities for tables community-centres.csv and cultural-spaces.csv.
New size 1
Calculating Similarities for tables community-centres.csv and eo4.csv.
No matches found, skipping to next column.
Calculating Similarities for tables community-centres.csv and eo_pr.csv.
New size 1
Calculating Similarities for tables community-centres.csv and eo_xx.csv.
No matches found, skipping to next column.
Calculating Similarities for tables community-centres.csv and libraries.csv.
Adding 1 columns
New size 2
Calculating Similarities for tables community-centres.csv and population-census-of-botswana-2011.csv.
No matches found, skipping to next column.
Calculating Similarities for tables community-centres.csv and public-art-artists.csv.
New size 2
Calculating Similarities for tables community-centres.csv and public-art.csv.
Adding 1 columns
New size 3
Calculating Similariti

In [18]:
print(similarity_calculations, brute_force_calculations)

1201 14677


In [19]:
all_joinable_columns.shape

(55, 6)

In [20]:
# for i in range(len(all_joinable_files)-1):   
#     for j in range(i+1, len(all_joinable_files)):
        

SyntaxError: incomplete input (1013818343.py, line 3)

In [21]:
all_joinable_columns

Unnamed: 0,FilenameLeft,ColumnLeft,FilenameRight,ColumnRight,SemanticType,JaccardSimilarity
4,community-centres.csv,Geo Local Area,community-gardens-and-food-trees.csv,Geo Local Area,addressLoc,0.782609
4,community-centres.csv,Geo Local Area,libraries.csv,Geo Local Area,addressLoc,0.85
6,community-centres.csv,Geo Local Area,public-art.csv,GeoLocalArea,addressLoc,0.772727
3,community-centres.csv,Geo Local Area,rental-standards-current-issues.csv,Geo Local Area,addressLoc,0.818182
2,community-centres.csv,Geo Local Area,schools.csv,Geo Local Area,addressLoc,0.818182
2,community-centres.csv,Geo Local Area,street-intersections.csv,Geo Local Area,addressLoc,0.782609
4,community-gardens-and-food-trees.csv,Geo Local Area,libraries.csv,Geo Local Area,addressLoc,0.826087
44,community-gardens-and-food-trees.csv,Geo Local Area,public-art.csv,Neighbourhood,addressLoc,0.777778
45,community-gardens-and-food-trees.csv,Geo Local Area,public-art.csv,GeoLocalArea,addressLoc,0.913043
9,community-gardens-and-food-trees.csv,Geo Local Area,rental-standards-current-issues.csv,Geo Local Area,addressLoc,0.956522


In [22]:
all_joinable_columns.shape

(55, 6)

In [23]:
df_joinable_columns

Unnamed: 0,ds_name,att_name,ds_name_2,att_name_2
0,eo_pr.csv,NAME,eo4.csv,NAME
1,eo_pr.csv,ICO,eo4.csv,ICO
2,eo_pr.csv,STREET,eo4.csv,STREET
3,eo_pr.csv,CITY,eo4.csv,CITY
4,eo_pr.csv,STATE,eo4.csv,STATE
...,...,...,...,...
87,eo4.csv,STATE,eo_xx.csv,STATE
88,eo_xx.csv,ZIP,eo4.csv,ZIP
89,eo_xx.csv,NTEE_CD,eo4.csv,NTEE_CD
90,eo_xx.csv,SORT_NAME,eo4.csv,SORT_NAME


In [24]:
df_joinable_columns[df_joinable_columns['ds_name'] == 'cultural-spaces.csv']

Unnamed: 0,ds_name,att_name,ds_name_2,att_name_2
8,cultural-spaces.csv,LOCAL_AREA,street-intersections.csv,Geo Local Area
10,cultural-spaces.csv,LOCAL_AREA,rental-standards-current-issues.csv,Geo Local Area
12,cultural-spaces.csv,LOCAL_AREA,public-art.csv,Neighbourhood
14,cultural-spaces.csv,LOCAL_AREA,public-art.csv,GeoLocalArea
16,cultural-spaces.csv,LOCAL_AREA,schools.csv,Geo Local Area
18,cultural-spaces.csv,LOCAL_AREA,community-gardens-and-food-trees.csv,Geo Local Area
20,cultural-spaces.csv,ACTIVE_SPACE,community-gardens-and-food-trees.csv,NUMBER_OF_FOOD_TREES
21,cultural-spaces.csv,ACTIVE_SPACE,community-gardens-and-food-trees.csv,FOOD_TREE_VARIETIES
22,cultural-spaces.csv,LOCAL_AREA,libraries.csv,Geo Local Area
24,cultural-spaces.csv,LOCAL_AREA,community-centres.csv,Geo Local Area


In [25]:
all_joinable_columns[all_joinable_columns['FilenameRight'] == 'cultural-spaces.csv']

Unnamed: 0,FilenameLeft,ColumnLeft,FilenameRight,ColumnRight,SemanticType,JaccardSimilarity


In [26]:
all_joinable_columns_joins = []
for i in range(len(all_joinable_columns)):
    all_joinable_columns_joins.append(';'.join(all_joinable_columns.iloc[i,:4].values))
all_joinable_columns_joins = np.array(all_joinable_columns_joins)
all_joinable_columns_joins

array(['community-centres.csv;Geo Local Area;community-gardens-and-food-trees.csv;Geo Local Area',
       'community-centres.csv;Geo Local Area;libraries.csv;Geo Local Area',
       'community-centres.csv;Geo Local Area;public-art.csv;GeoLocalArea',
       'community-centres.csv;Geo Local Area;rental-standards-current-issues.csv;Geo Local Area',
       'community-centres.csv;Geo Local Area;schools.csv;Geo Local Area',
       'community-centres.csv;Geo Local Area;street-intersections.csv;Geo Local Area',
       'community-gardens-and-food-trees.csv;Geo Local Area;libraries.csv;Geo Local Area',
       'community-gardens-and-food-trees.csv;Geo Local Area;public-art.csv;Neighbourhood',
       'community-gardens-and-food-trees.csv;Geo Local Area;public-art.csv;GeoLocalArea',
       'community-gardens-and-food-trees.csv;Geo Local Area;rental-standards-current-issues.csv;Geo Local Area',
       'community-gardens-and-food-trees.csv;Geo Local Area;schools.csv;Geo Local Area',
       'community

In [27]:
df_joinable_columns_joins = []
for i in range(len(df_joinable_columns)):
    df_joinable_columns_joins.append(';'.join(df_joinable_columns.iloc[i,:4].values))
    df_joinable_columns_joins.append(';'.join(df_joinable_columns.iloc[i,2:].values)+';'+';'.join(df_joinable_columns.iloc[i,:2].values))
df_joinable_columns_joins = np.array(df_joinable_columns_joins)
df_joinable_columns_joins = np.unique(df_joinable_columns_joins)
df_joinable_columns_joins

array(['community-centres.csv;Geo Local Area;community-gardens-and-food-trees.csv;Geo Local Area',
       'community-centres.csv;Geo Local Area;cultural-spaces.csv;LOCAL_AREA',
       'community-centres.csv;Geo Local Area;libraries.csv;Geo Local Area',
       'community-centres.csv;Geo Local Area;public-art.csv;GeoLocalArea',
       'community-centres.csv;Geo Local Area;public-art.csv;Neighbourhood',
       'community-centres.csv;Geo Local Area;rental-standards-current-issues.csv;Geo Local Area',
       'community-centres.csv;Geo Local Area;schools.csv;Geo Local Area',
       'community-centres.csv;Geo Local Area;street-intersections.csv;Geo Local Area',
       'community-gardens-and-food-trees.csv;FOOD_TREE_VARIETIES;cultural-spaces.csv;ACTIVE_SPACE',
       'community-gardens-and-food-trees.csv;Geo Local Area;community-centres.csv;Geo Local Area',
       'community-gardens-and-food-trees.csv;Geo Local Area;cultural-spaces.csv;LOCAL_AREA',
       'community-gardens-and-food-trees.csv;

In [28]:
all_joinable_columns_joins.shape

(55,)

In [29]:
df_joinable_columns_joins.shape

(112,)

In [30]:
jaccard_similarity(set(all_joinable_columns_joins), set(df_joinable_columns_joins))

0.1597222222222222

In [31]:
tp = 0
fp = 0
fn = 0

for i in range(len(all_joinable_columns_joins)):
    if(all_joinable_columns_joins[i] in df_joinable_columns_joins):
        tp += 1
    else:
        fp += 1
    fn = len(df_joinable_columns_joins) - (tp+fp)

precision = (tp)/(tp+fp)
recall = (tp)/(tp+fn)
f1_score = 2 * (precision*recall)/(precision+recall)

print(precision, recall, f1_score)

0.41818181818181815 0.2875 0.34074074074074073


In [32]:
ST_predictions_dict['eo4.csv']

Unnamed: 0,Column,Predicted
0,EIN,Integer
1,NAME,Text
2,ICO,Person
3,STREET,postalCode
4,CITY,Country
5,STATE,Product/name
6,ZIP,identifierName
7,GROUP,Integer
8,SUBSECTION,Integer
9,AFFILIATION,Integer


In [33]:
ST_predictions_dict['eo_pr.csv']

Unnamed: 0,Column,Predicted
0,EIN,Integer
1,NAME,Organization
2,ICO,postalCode
3,STREET,postalCode
4,CITY,addressLoc
5,STATE,Product/name
6,ZIP,telephone
7,GROUP,Integer
8,SUBSECTION,Integer
9,AFFILIATION,Integer


In [34]:
ST_predictions_dict['cultural-spaces.csv']

Unnamed: 0,Column,Predicted
0,YEAR,DateTime
1,CULTURAL_SPACE_NAME,Text
2,WEBSITE,URL
3,TYPE,category
4,PRIMARY_USE,category
5,ADDRESS,addressLoc
6,LOCAL_AREA,Text
7,OWNERSHIP,ItemAvailability
8,SQUARE_FEET,price
9,NUMBER_OF_SEATS,price


In [35]:
semantic_types

array(['Action', 'AdministrativeArea; GeoShape; Place', 'AggregateRating',
       'Book/name', 'BookFormatType', 'Boolean', 'Boolean; Text',
       'Boolean; URL', 'Brand', 'CategoryCode', 'ContactPoint',
       'CoordinateAT', 'Country', 'CreativeWork', 'CreativeWork/name',
       'CreativeWork; ItemList', 'CreativeWork; URL',
       'CreativeWorkSeason', 'CreativeWorkSeries', 'Date',
       'Date; DateTime', 'Date; DateTime; Time', 'DateTime; Time',
       'DayOfWeek', 'DeliveryMethod', 'Distance', 'Duration',
       'EducationalOccupationalCredential',
       'EducationalOrganization; Organization', 'Energy', 'Event/name',
       'EventAttendanceModeEnumeration', 'EventStatusType', 'GenderType',
       'GeoCoordinates; GeoShape', 'GeoShape; Place', 'Hotel/name',
       'IdentifierAT', 'Integer', 'Integer; Text', 'InteractionCounter',
       'ItemAvailability', 'ItemList; MusicRecording', 'JobPosting/name',
       'Language', 'LoanOrCredit; PaymentMethod', 'LocalBusiness/name',
     