In [1]:
import pandas as pd
from pathlib import Path
from datetime import datetime

# Set working directory
working_dir = Path.cwd()

# File paths
formats_file = working_dir / "formats_dictionary.txt"
sherlock_file = working_dir / "SherlockTypes.csv"
output_file = working_dir / "Sherlock_to_HeadersIQ_Mapping_CLEAN.csv"

# Step 1: Load dictionary into reverse_map
reverse_map = {}
with open(formats_file, "r", encoding="utf-8") as f:
    for line in f:
        if ":" in line:
            key, value = line.strip().split(":", 1)
            key = key.strip().strip("'").strip('"').lower()
            value = value.strip().strip("'").strip('"')
            reverse_map.setdefault(value, set()).add(key)

# Step 2: Create a flat set of all dictionary keywords
all_headersiq_formats = set(reverse_map.keys())

# Step 3: Load Sherlock formats and map
df_sherlock = pd.read_csv(sherlock_file, header=None, names=["SherlockFormat"])

def map_to_headersiq(sherlock_format):
    fmt = str(sherlock_format).strip().lower()
    for hq_format, keywords in reverse_map.items():
        if fmt in keywords or fmt == hq_format:
            return hq_format
    return "<no match>"

df_sherlock["HeadersIQ_Format"] = df_sherlock["SherlockFormat"].apply(map_to_headersiq)
df_sherlock["HeadersIQ_Format"] = df_sherlock["HeadersIQ_Format"].str.strip(" ,'\"")
df_sherlock.to_csv(output_file, index=False)

# Step 4: Compute formats that Sherlock does NOT cover
mapped_formats = set(df_sherlock["HeadersIQ_Format"].unique()) - {"<no match>"}
unmapped_formats = sorted(all_headersiq_formats - mapped_formats)

# Display results
print(f"âœ… File saved at: {output_file.resolve()}")
print("\nðŸŸ¨ HeadersIQ formats NOT covered by Sherlock:")
for fmt in unmapped_formats:
    print(f"- {fmt}")

print(f"\nLast run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


âœ… File saved at: C:\Users\290099c\OneDrive\Documentos\Android Meus Documentos\PhD\FINAL THESIS\THESIS CODES\Sherlock_to_HeadersIQ_Mapping_CLEAN.csv

ðŸŸ¨ HeadersIQ formats NOT covered by Sherlock:
- E-mailformat',
- IDcolumn',
- IPformat',
- URLformat
- URLformat',
- acidity',
- age',
- alkalinity',
- angle',
- binary',
- bloodpressure',
- categorical',
- city',
- country',
- date',
- datetime',
- day',
- heartrate',
- hour',
- latitude',
- longitude',
- modelname',
- money',
- month',
- name',
- normalized',
- numerical',
- numerical,
- numerical>=0'',
- numerical>=0',
- percentage',
- ph',
- phone',
- postalcode',
- saltness',
- state',
- street',
- string',
- tannins',
- time',
- week',
- weekday',
- year',

Last run on: 2025-06-04 14:16:47


In [None]:
# Re-execute after environment reset
from pathlib import Path
import pandas as pd
from datetime import datetime

# Reload formats dictionary with cleaning
formats_file = Path("formats_dictionary.txt")
reverse_map_cleaned = {}

with open(formats_file, "r", encoding="utf-8") as f:
    for line in f:
        if ":" in line:
            key, value = line.strip().split(":", 1)
            key = key.strip().strip("'\"").lower()
            value = value.strip().strip("'\",").lower()
            reverse_map_cleaned.setdefault(value, set()).add(key)

# Load Sherlock formats
sherlock_file = Path("SherlockTypes.csv")
df_sherlock = pd.read_csv(sherlock_file, header=None, names=["SherlockFormat"])
df_sherlock["SherlockFormat"] = df_sherlock["SherlockFormat"].str.strip().str.lower()

# Map function
def map_cleaned_format(sherlock_format):
    for hq_format, keywords in reverse_map_cleaned.items():
        if sherlock_format in keywords or sherlock_format == hq_format:
            return hq_format
    return "<no match>"

df_sherlock["HeadersIQ_Format"] = df_sherlock["SherlockFormat"].apply(map_cleaned_format)

# Save output
cleaned_output = Path("Sherlock_to_HeadersIQ_Mapping_CLEANED.csv")
df_sherlock.to_csv(cleaned_output, index=False)

# Identify unmapped formats from dictionary
mapped_formats = set(df_sherlock["HeadersIQ_Format"].unique()) - {"<no match>"}
all_headersiq_formats = set(reverse_map_cleaned.keys())
unmapped_formats = sorted(all_headersiq_formats - mapped_formats)


print(f"âœ… Cleaned mapping saved to: {cleaned_output.resolve()}")
print("\nðŸŸ¨ HeadersIQ formats NOT covered by Sherlock:")
for fmt in unmapped_formats:
    print(f"- {fmt}")

print(f"\nLast run on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


âœ… Cleaned mapping saved to: C:\Users\290099c\OneDrive\Documentos\Android Meus Documentos\PhD\FINAL THESIS\THESIS CODES\Sherlock_to_HeadersIQ_Mapping_CLEANED.csv

ðŸŸ¨ HeadersIQ formats NOT covered by Sherlock:
- acidity
- alkalinity
- angle
- binary
- bloodpressure
- datetime
- e-mailformat
- heartrate
- hour
- ipformat
- latitude
- longitude
- modelname
- month
- normalized
- percentage
- ph
- phone
- postalcode
- saltness
- tannins
- time
- urlformat
- week
- weekday

Last run on: 2025-06-04 14:51:29


: 

In [5]:
import pandas as pd

# 1. Load the Excel file (adjust the path to wherever AnalysedColumnsSato.xlsx lives)
df = pd.read_excel("AnalysedColumnsSato.xlsx")  # or "/mnt/data/AnalysedColumnsSato.xlsx"

# 2. Compute the frequency distributions
final_format_counts = df["FinalFormat"].value_counts().reset_index()
final_format_counts.columns = ["FinalFormat", "Count"]

source_keyword_counts = df["SourceKeyword"].value_counts().reset_index()
source_keyword_counts.columns = ["SourceKeyword", "Count"]

# 3. Print the results
print("=== FinalFormat Frequency Distribution ===")
print(final_format_counts.to_string(index=False))

print("\n=== SourceKeyword Frequency Distribution ===")
print(source_keyword_counts.to_string(index=False))



=== FinalFormat Frequency Distribution ===
 FinalFormat  Count
 categorical  49215
        name  19472
      string  12840
numerical>=0   7813
         age   7171
        year   5293
        city   5283
       state   4357
     country   1788
      street    841
         day    572
   numerical    351
    IDcolumn    227
       money     70
        date      5

=== SourceKeyword Frequency Distribution ===
 SourceKeyword  Count
          name  15827
   description  11144
          team  10348
           age   7171
          type   6526
      location   5340
          year   5293
          city   5281
          rank   4856
        status   4704
         state   4357
      category   3988
          code   2697
          club   2182
        artist   2136
        result   2086
       country   1714
        weight   1592
      position   1558
       company   1518
         album   1490
         class   1194
        symbol   1035
         notes   1002
       address    841
      duration    7