In [1]:
import numpy as np
import pandas as pd
from transformers import pipeline
                         
from huggingface_hub import notebook_login

from utils import (find_all_linear_names,
                    initiate_base_model)

import os
import warnings
import glob
import json

from tqdm import tqdm

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
import seaborn as sns

os.environ["WANDB_DISABLED"] = "true"
warnings.filterwarnings("ignore")

In [2]:
path = '/home/manoelflorencio/cta_for_jd/testbedXS'
os.chdir(path)
print(os.getcwd())

/home/manoelflorencio/cta_for_jd/testbedXS


# Import Model

In [3]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
base_model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model, tokenizer = initiate_base_model(base_model_name)

model.safetensors.index.json:   0%|          | 0.00/89.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [5]:
modules = find_all_linear_names(model)

In [77]:
def generate_prompt_predict_possible_joins(target_description, candidate_descriptions, candidate_tables):

    system_msg = f"""
            Given one target column description and many candidate column descriptions, predict all the pairs (candidate table name, candidate 
            description column name) that could be joined, answer only with the pairs without including reasoning or extra info.

            Task: Look carefully at the target column description and candidate column descriptions and use this information to identify 
            patterns and relationships between the descriptions, the result must be a list of all the JOINable pairs found. If no joinable pair is 
            found the result should be just the word "none".

            Additional info: A JOIN in relational databases is an operation that retrieves related rows from two tables by linking them 
            based on related columns between them.
            
            Instructions: 
                1. Look at the target description given to you. 
                2. Look at the candidate descriptions in detail. 
                3. Predict if the target column description belongs to a column that may be used in join. 
                4. Select all the highly likely JOINs between these candidate columns based only on these descriptions and the similarities between their semantics. Disregard the column names.

            Example 1:
                Target description: this column represents a worker's id
                Candidate tables: ['salary.csv','salary.csv','hospital.csv']
                Candidate description: ['the column worker_id represents the worker's id', 'this column represents a worker's salary', 'this column represents a hospital location']
                Possible JOINs: ('salary.csv', 'worker_id')
                """
            
    user_msg = f""" Target description:      {target_description}
                   Candidate table:         {candidate_tables.values}
                   Candidate descriptions:  {candidate_descriptions.values}
                   Possible JOINs: """.strip()

    messages=[
                {
                    "role": "system", 
                     "content": f"{system_msg}"
                },
                {
                    "role": "user",
                    "content": f"{user_msg}"
                }
            ]

    return messages

In [62]:
def predict(table, prompt, model, tokenizer, split_key):
 
    pipe = pipeline(task="text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    max_new_tokens=50,
                    temperature=0.1)

    result = pipe(prompt)
    answer = result[0]['generated_text'][-1]['content']

    return answer

In [9]:
descriptions = pd.read_csv('Description_test/all_descriptions.csv')
descriptions.head()

Unnamed: 0,TableName,Column,Description
0,statewise-census-data-in-india-1901-2011.csv,FREQUENCY,The 'FREQUENCY' column consists of the tempora...
1,statewise-census-data-in-india-1901-2011.csv,DATE,The 'DATE' column in the table represents the ...
2,statewise-census-data-in-india-1901-2011.csv,LOCATION_NAME,"The ""LOCATION_NAME"" column in the table repres..."
3,statewise-census-data-in-india-1901-2011.csv,LOCATION_F5,"The column ""LOCATION_F5"" in the table appears ..."
4,statewise-census-data-in-india-1901-2011.csv,LOCATION_ID,"The ""LOCATION_ID"" column contains identifiers ..."


In [10]:
# files = ['eo_pr.csv', 'cultural-spaces.csv', 'public-art.csv', 'libraries.csv', 'schools.csv']
files = [file.split('/')[-1] for file in glob.glob('datasets/*')]
files

['statewise-census-data-in-india-1901-2011.csv',
 'road-ahead-current-road-closures.csv',
 'property-tie-lines.csv',
 'public-art.csv',
 'gvrd-sewer-trunk-mains.csv',
 'SCS_Staff_Salaries_data_30th_June 2010.csv',
 'schools.csv',
 'rental-standards-current-issues.csv',
 'datasets_579296_1047868_authors.csv',
 'survey_results_schema.csv',
 'animal-control-inventory-lost-and-found.csv',
 'glassdoor_wwfu_val_captions.csv',
 'eo_xx.csv',
 'community-gardens-and-food-trees.csv',
 'road-ahead-upcoming-projects.csv',
 'libraries.csv',
 'cultural-spaces.csv',
 'datasets_517172_952401_train.csv',
 'public-art-artists.csv',
 'eo4.csv',
 'currency_exchange.csv',
 'eo_pr.csv',
 'road-ahead-projects-under-construction.csv',
 'ability_ids.csv',
 'population-by-governorate-citizenship-and-gender.csv',
 'community-centres.csv',
 'street-intersections.csv',
 'population-census-of-botswana-2011.csv']

In [11]:
df_dsInformation = pd.read_csv('datasetInformation_testbedXS.csv')

In [80]:
joins_dict = {}
step = 5

exclude_files = ['']

for file in files:
    if(file in exclude_files):
        continue
    
    info  = df_dsInformation[df_dsInformation['filename'] == file]                
    table = pd.read_csv(f'datasets/{file}', delimiter=info['delimiter'].values[0])

    table_descriptions = descriptions[descriptions['TableName'] == file]
    candidate_tables       = descriptions[descriptions['TableName'] != file].iloc[:,0]
    candidate_descriptions = descriptions[descriptions['TableName'] != file].iloc[:,2]

    joins_dict[f'{file}'] = {}
    
    for i in tqdm(range(table_descriptions.shape[0])):

        joins_dict[f'{file}'][f'{table_descriptions.iloc[i, 1]}'] = ''
        
        for j in range(0, len(candidate_tables), step):
            target_description = table_descriptions.iloc[i, 2]

            sample_cadidate_tables = candidate_tables[j:j+step]
            sample_candidate_descriptions = candidate_descriptions[j:j+step]

            prompt_joins = generate_prompt_predict_possible_joins(target_description, sample_cadidate_tables, sample_candidate_descriptions)
            split_key_domain = "Possible JOINs:"
            sample_joins = predict(table, prompt_joins, model, tokenizer, split_key_domain)

            if((sample_joins != 'none') or (sample_joins != 'None')):
                joins_dict[f'{file}'][f'{table_descriptions.iloc[i, 1]}'] += ', ' + sample_joins

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [09:37<00:00, 64.18s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [06:52<00:00, 68.68s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [02:07<00:00, 63.73s/it]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19/19 [21:08<00:00, 66.78s/it]
100%|███████████████████████████████████████████

In [81]:
with open('Description_test/join_prediction_LLaMa-3.2-11b_v1.json', 'w') as file:
    json.dump(joins_dict, file)

In [None]:
with open('Description_test/join_prediction_v7_LLaMa-3.2-11b.json', 'r') as file:
    joins_dict = json.load(file)