In [106]:
!pip install python-Levenshtein
!pip install scikit-learn
!pip install numpy


[31mERROR: Operation cancelled by user[0m[31m


In [108]:
import os
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
import Levenshtein
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score

In [109]:
def levenshtein_distance(str1, str2):
    return Levenshtein.distance(str1, str2)

In [110]:
def schema_mapping_levenshtein(schema1, schema2, schema1_values, schema2_values, threshold=0.5):
    # Calculate the Levenshtein distance matrix for column names
    name_distance_matrix = cdist(
        np.array(schema1).reshape(-1, 1),
        np.array(schema2).reshape(-1, 1),
        lambda x, y: levenshtein_distance(x[0], y[0])
    )
    max_name_distance = np.max(name_distance_matrix)
    name_similarity_matrix = 1 - (name_distance_matrix / max_name_distance)

    # Calculate the average Levenshtein distance for sample values
    value_distance_matrix = np.zeros((len(schema1), len(schema2)))
    for i, col1 in enumerate(schema1):
        for j, col2 in enumerate(schema2):
            distances = [levenshtein_distance(val1, val2) for val1, val2 in zip(schema1_values[col1], schema2_values[col2])]
            avg_distance = np.mean(distances)
            value_distance_matrix[i, j] = avg_distance
    max_value_distance = np.max(value_distance_matrix)
    value_similarity_matrix = 1 - (value_distance_matrix / max_value_distance)

    # Combine name and value similarities
    combined_similarity_matrix = (name_similarity_matrix + value_similarity_matrix) / 2

    # Apply the threshold to convert values to binary
    binary_matrix = (combined_similarity_matrix >= threshold).astype(int)

    # Create a DataFrame for better visualization
    distance_df = pd.DataFrame(binary_matrix, index=schema1, columns=schema2)
    print("Combined Binary Similarity Matrix:")
    print(distance_df)

    # Find the best matches based on the maximum similarity
    schema_mapping = {}
    for col1 in schema1:
        best_match = distance_df.loc[col1].idxmax()
        schema_mapping[col1] = best_match

    return schema_mapping, binary_matrix

In [111]:
def jaccard_similarity(str1, str2):
    set1 = set(str1)
    set2 = set(str2)
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union

def jaccard_distance(str1, str2):
    return 1 - jaccard_similarity(str1, str2)

# Schema Mapping using Jaccard Distance
def schema_mapping_jaccard(schema1, schema2, schema1_values, schema2_values, threshold=0.5):
    # Calculate the Jaccard distance matrix for column names
    name_distance_matrix = cdist(
        np.array(schema1).reshape(-1, 1),
        np.array(schema2).reshape(-1, 1),
        lambda x, y: jaccard_distance(x[0], y[0])
    )
    name_similarity_matrix = 1 - name_distance_matrix

    # Calculate the average Jaccard distance for sample values
    value_distance_matrix = np.zeros((len(schema1), len(schema2)))
    for i, col1 in enumerate(schema1):
        for j, col2 in enumerate(schema2):
            distances = [jaccard_distance(val1, val2) for val1, val2 in zip(schema1_values[col1], schema2_values[col2])]
            avg_distance = np.mean(distances)
            value_distance_matrix[i, j] = avg_distance
    value_similarity_matrix = 1 - value_distance_matrix

    # Combine name and value similarities
    combined_similarity_matrix = (name_similarity_matrix + value_similarity_matrix) / 2

    # Apply the threshold to convert values to binary
    binary_matrix = (combined_similarity_matrix >= threshold).astype(int)

    # Create a DataFrame for better visualization
    distance_df = pd.DataFrame(binary_matrix, index=schema1, columns=schema2)
    print("Combined Binary Similarity Matrix:")
    print(distance_df)

    # Find the best matches based on the maximum similarity
    schema_mapping = {}
    for col1 in schema1:
        best_match = distance_df.loc[col1].idxmax()
        schema_mapping[col1] = best_match

    return schema_mapping, binary_matrix

In [112]:
def edit_distance(str1, str2):
    m = len(str1)
    n = len(str2)

    # Create a distance matrix
    dp = [[0] * (n + 1) for _ in range(m + 1)]

    # Initialize the matrix
    for i in range(m + 1):
        dp[i][0] = i
    for j in range(n + 1):
        dp[0][j] = j

    # Fill the matrix
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if str1[i - 1] == str2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
            else:
                dp[i][j] = 1 + min(dp[i - 1][j],        # deletion
                                   dp[i][j - 1],        # insertion
                                   dp[i - 1][j - 1])    # substitution

    return dp[m][n]


In [113]:
def monge_elkan_similarity(str1, str2):
    words1 = str1.split()
    words2 = str2.split()
    if not words1 or not words2:
        return 0  # If one of the strings is empty, similarity is 0
    max_similarities = []
    for word1 in words1:
        max_similarity = max([1 - (edit_distance(word1, word2) / max(len(word1), len(word2))) for word2 in words2])
        max_similarities.append(max_similarity)
    return np.mean(max_similarities)

def monge_elkan_distance(str1, str2):
    return 1 - monge_elkan_similarity(str1, str2)

# Schema Mapping using Monge-Elkan Distance
def schema_mapping_monge_elkan(schema1, schema2, schema1_values, schema2_values, threshold=0.5):
    # Calculate the Monge-Elkan distance matrix for column names
    name_distance_matrix = cdist(
        np.array(schema1).reshape(-1, 1),
        np.array(schema2).reshape(-1, 1),
        lambda x, y: monge_elkan_distance(x[0], y[0])
    )
    name_similarity_matrix = 1 - name_distance_matrix

    # Calculate the average Monge-Elkan distance for sample values
    value_distance_matrix = np.zeros((len(schema1), len(schema2)))
    for i, col1 in enumerate(schema1):
        for j, col2 in enumerate(schema2):
            distances = [monge_elkan_distance(val1, val2) for val1, val2 in zip(schema1_values[col1], schema2_values[col2])]
            avg_distance = np.mean(distances)
            value_distance_matrix[i, j] = avg_distance
    value_similarity_matrix = 1 - value_distance_matrix

    # Combine name and value similarities
    combined_similarity_matrix = (name_similarity_matrix + value_similarity_matrix) / 2

    # Apply the threshold to convert values to binary
    binary_matrix = (combined_similarity_matrix >= threshold).astype(int)

    # Create a DataFrame for better visualization
    distance_df = pd.DataFrame(binary_matrix, index=schema1, columns=schema2)
    print("Combined Binary Similarity Matrix:")
    print(distance_df)

    # Find the best matches based on the maximum similarity
    schema_mapping = {}
    for col1 in schema1:
        best_match = distance_df.loc[col1].idxmax()
        schema_mapping[col1] = best_match

    return schema_mapping, binary_matrix

In [114]:
def calculate_scores(predicted_mapping, true_mapping):
    y_true_binary = []
    y_pred_binary = []

    # Create a binary vector for y_true and y_pred
    for key in true_mapping.keys():
        true_value = true_mapping[key]
        pred_value = predicted_mapping.get(key, None)

        if pred_value is not None:
            y_true_binary.append(1)
            if pred_value == true_value:
                y_pred_binary.append(1)
            else:
                y_pred_binary.append(0)
        else:
            y_true_binary.append(1)
            y_pred_binary.append(0)

    # Calculate precision, recall, and F1 score
    precision = precision_score(y_true_binary, y_pred_binary, zero_division=1)
    recall = recall_score(y_true_binary, y_pred_binary, zero_division=1)
    f1 = f1_score(y_true_binary, y_pred_binary, zero_division=1)

    return precision, recall, f1

In [115]:
current_dir = os.getcwd()

In [116]:
current_dir

'/content'

In [117]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [118]:
current_dir = '/content/drive/My Drive/ProgrammingFiles/SemanticTask2/'

In [119]:
schema1_path = os.path.join(current_dir , 'imdb.csv')
schema2_path = os.path.join(current_dir, 'rottentomatoes.csv')
schema3_path = os.path.join(current_dir, 'Bollywood.csv')


# Read the CSV files


In [120]:
try:
    schema1_df = pd.read_csv(schema1_path, delimiter=',')
    schema2_df = pd.read_csv(schema2_path, delimiter=',')
    schema3_df = pd.read_csv(schema3_path, delimiter=',')

except pd.errors.ParserError as e:
    print(f"Error reading CSV files: {e}")
    raise

In [121]:


# Create MovieOverview DataFrame
movie_overview_df = pd.DataFrame({
    'id':  schema1_df['Id'].fillna('') + schema2_df['Id'].fillna(''),
    'Name': schema3_df['Name'].fillna('') + schema1_df['Name'].fillna('') + schema2_df['Name'].fillna(''),
    'ReleaseDate': schema3_df['Release Date'].fillna('') + schema1_df['ReleaseDate'].fillna('') + schema2_df['Release Date'].fillna(''),
    'Director': schema3_df['Director'].fillna('') + schema1_df['Director'].fillna('') + schema2_df['Director'].fillna(''),
    'Creator': schema3_df['Producer'].fillna('') + schema1_df['Creator'].fillna('') + schema2_df['Creator'].fillna('')
})

# Create MovieDetail DataFrame
movie_detail_df = pd.DataFrame({
    'Name': schema3_df['Name'].fillna('') + schema1_df['Name'].fillna('') + schema2_df['Name'].fillna(''),
    'Genre': schema3_df['Genre'].fillna('') + schema1_df['Genre'].fillna('') + schema2_df['Genre'].fillna(''),
    'Description': schema1_df['Description'].fillna('') + schema2_df['Description'].fillna(''),
    'Duration': schema3_df['Runtime'].fillna('') + schema1_df['Duration'].fillna('') + schema2_df['Duration'].fillna('')
})

# Fill missing values with NaN and drop any fully empty rows
movie_overview_df.replace('', pd.NA, inplace=True)
movie_overview_df.dropna(how='all', inplace=True)

movie_detail_df.replace('', pd.NA, inplace=True)
movie_detail_df.dropna(how='all', inplace=True)

# Save to CSV files
movie_overview_df.to_csv(f'{current_dir}/MovieOverview.csv', index=False)
movie_detail_df.to_csv(f'{current_dir}/MovieDetail.csv', index=False)


In [122]:
MovieMediated = os.path.join(current_dir , 'MovieOverview.csv')
MovieDetail = os.path.join(current_dir, 'MovieDetail.csv')

MovieMediated_df = pd.read_csv(MovieMediated, delimiter=',')
MovieDetail_df = pd.read_csv(MovieDetail, delimiter=',')

In [123]:
print("Schema1 Columns:", schema1_df.columns.tolist())
print("Schema2 Columns:", schema2_df.columns.tolist())
print("Schema3 Columns:", schema3_df.columns.tolist())
print("MovieMediated Columns:", MovieMediated_df.columns.tolist())
print("MovieDetail Columns:", movie_detail_df.columns.tolist())


MovieMediatedColumns = MovieMediated_df.columns.tolist()
MovieDetailColumns = movie_detail_df.columns.tolist()
schema1Columns = schema1_df.columns.tolist()
schema2Columns = schema2_df.columns.tolist()
schema3Columns = schema3_df.columns.tolist()


schema1_values = {col: schema1_df[col].astype(str).tolist()[:10] for col in schema1Columns}
schema2_values = {col: schema2_df[col].astype(str).tolist()[:10] for col in schema2Columns}
schema3_values = {col: schema3_df[col].astype(str).tolist()[:10] for col in schema3Columns}
MovieMediatedValues= {col: MovieMediated_df[col].astype(str).tolist()[:10] for col in MovieMediatedColumns}
MovieDetailValues= {col: movie_detail_df[col].astype(str).tolist()[:10] for col in MovieDetailColumns}




Schema1 Columns: ['Id', 'Name', 'YearRange', 'ReleaseDate', 'Director', 'Creator', 'Cast', 'Duration', 'RatingValue', 'ContentRating', 'Genre', 'Url', 'Description']
Schema2 Columns: ['Id', 'Name', 'Year', 'Release Date', 'Director', 'Creator', 'Actors', 'Cast', 'Language', 'Country', 'Duration', 'RatingValue', 'RatingCount', 'ReviewCount', 'Genre', 'Filming Locations', 'Description']
Schema3 Columns: ['Unnamed: 0', 'Actors 1', 'Actors 2', 'Actors 3', 'Actors 4', 'Actors 5', 'Actors 6', 'Actors 7', 'Budget', 'Dialogue', 'Director', 'Directors', 'First Day', 'First Week', 'First Weekend', 'Genre', 'India Gross', 'Lyrics', 'Music', 'Overseas Gross', 'Producer', 'Production Banner', 'Release Date', 'Runtime', 'Screenplay', 'Screens', 'Story', 'Name', 'Worldwide Gross']
MovieMediated Columns: ['id', 'Name', 'ReleaseDate', 'Director', 'Creator']
MovieDetail Columns: ['Name', 'Genre', 'Description', 'Duration']


In [124]:


# Check if 'column_name' exists in the DataFrames

threshold =0.7

predicted_mapping, binary_matrix = schema_mapping_levenshtein(MovieMediatedColumns, schema1Columns, MovieMediatedValues ,schema1_values , threshold)
print("\nPredicted Schema Mapping:")
for key, value in predicted_mapping.items():
    print(f"{key} -> {value}")

true_mapping_MovieMediated = {
"Name":"Name",
"ReleaseDate":"Release Date",
"Director": "Director " ,
"Creator" : "Creator"

}
# Precision, Recall, and F1 Score Calculation

precision, recall, f1 = calculate_scores(predicted_mapping, true_mapping_MovieMediated)
print("\nPrecision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Combined Binary Similarity Matrix:
             Id  Name  YearRange  ReleaseDate  Director  Creator  Cast  \
id            1     1          0            0         0        0     1   
Name          1     1          0            0         0        1     1   
ReleaseDate   0     0          0            1         0        0     0   
Director      0     0          0            0         1        1     0   
Creator       0     0          0            0         1        1     0   

             Duration  RatingValue  ContentRating  Genre  Url  Description  
id                  0            0              0      1    1            0  
Name                0            0              0      1    1            0  
ReleaseDate         0            0              0      0    0            0  
Director            1            0              0      0    0            0  
Creator             1            0              0      0    0            0  

Predicted Schema Mapping:
id -> Id
Name -> Id
ReleaseDate

In [125]:

threshold =0.7

# Perform schema mapping
predicted_mapping, binary_matrix = schema_mapping_levenshtein(MovieMediatedColumns, schema2Columns, MovieMediatedValues ,schema2_values , threshold)
print("\nSchema Mapping:")
for key, value in predicted_mapping.items():
    print(f"{key} -> {value}")


true_mapping_MovieMediated = {
"Name":"Name",
"ReleaseDate":"Release Date",
"Director": "Director " ,
"Creator" : "Creator"

}
print(predicted_mapping)

precision, recall, f1 = calculate_scores(predicted_mapping, true_mapping_MovieMediated)
print("\nPrecision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Combined Binary Similarity Matrix:
             Id  Name  Year  Release Date  Director  Creator  Actors  Cast  \
id            1     1     1             0         1        1       1     0   
Name          1     1     1             0         0        1       1     0   
ReleaseDate   0     0     0             1         0        0       0     0   
Director      0     0     1             0         1        1       1     0   
Creator       0     0     0             0         1        1       0     0   

             Language  Country  Duration  RatingValue  RatingCount  \
id                  0        1         1            0            0   
Name                1        0         0            0            0   
ReleaseDate         0        0         0            0            0   
Director            0        0         1            0            0   
Creator             0        0         0            0            0   

             ReviewCount  Genre  Filming Locations  Description  
id       

In [126]:

threshold =0.7

# Perform schema mapping
predicted_mapping, binary_matrix = schema_mapping_levenshtein(MovieMediatedColumns, schema3Columns, MovieMediatedValues ,schema3_values , threshold)
print("\nSchema Mapping:")
for key, value in predicted_mapping.items():
    print(f"{key} -> {value}")


true_mapping_MovieMediated = {
"Name":"Name",
"ReleaseDate":"Release Date",
"Director": "Director " ,
"Creator" : "Creator"

}
print(predicted_mapping)

precision, recall, f1 = calculate_scores(predicted_mapping, true_mapping_MovieMediated)
print("\nPrecision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Combined Binary Similarity Matrix:
             Unnamed: 0  Actors 1  Actors 2  Actors 3  Actors 4  Actors 5  \
id                    0         0         0         0         0         0   
Name                  0         0         0         0         0         0   
ReleaseDate           0         0         0         0         0         0   
Director              0         0         0         0         0         0   
Creator               0         0         0         0         0         0   

             Actors 6  Actors 7  Budget  Dialogue  ...  Overseas Gross  \
id                  0         0       0         0  ...               0   
Name                0         0       0         0  ...               0   
ReleaseDate         0         0       0         0  ...               0   
Director            0         0       0         0  ...               0   
Creator             0         0       0         0  ...               0   

             Producer  Production Banner  Release Date  R

In [127]:
threshold =0.7

# Perform schema mapping
predicted_mapping, binary_matrix = schema_mapping_jaccard(MovieMediatedColumns, schema1Columns, MovieMediatedValues ,schema1_values , threshold)
print("\nPredicted Schema Mapping:")
for key, value in predicted_mapping.items():
    print(f"{key} -> {value}")

# Define the ground truth mapping
true_mapping = {
    "Name": "Name",
    "ReleaseDate": "Release Date",
    "Director": "Director",
    "Creator": "Creator"
}

# Calculate precision, recall, and F1 score
precision, recall, f1 = calculate_scores(predicted_mapping, true_mapping)
print("\nPrecision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Combined Binary Similarity Matrix:
             Id  Name  YearRange  ReleaseDate  Director  Creator  Cast  \
id            0     0          0            0         0        0     0   
Name          0     1          0            0         0        0     0   
ReleaseDate   0     0          0            1         0        0     0   
Director      0     0          0            0         1        0     0   
Creator       0     0          0            0         0        1     0   

             Duration  RatingValue  ContentRating  Genre  Url  Description  
id                  0            0              0      0    0            0  
Name                0            0              0      0    0            0  
ReleaseDate         0            0              0      0    0            0  
Director            0            0              0      0    0            0  
Creator             0            0              0      0    0            0  

Predicted Schema Mapping:
id -> Id
Name -> Name
ReleaseDa

In [128]:
threshold =0.7
predicted_mapping, binary_matrix = schema_mapping_jaccard(MovieMediatedColumns, schema2Columns, MovieMediatedValues ,schema2_values , threshold)
print("\nPredicted Schema Mapping:")
for key, value in predicted_mapping.items():
    print(f"{key} -> {value}")

# Define the ground truth mapping
true_mapping = {
    "Name": "Name",
    "ReleaseDate": "Release Date",
    "Director": "Director",
    "Creator": "Creator"
}

# Calculate precision, recall, and F1 score
precision, recall, f1 = calculate_scores(predicted_mapping, true_mapping)
print("\nPrecision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Combined Binary Similarity Matrix:
             Id  Name  Year  Release Date  Director  Creator  Actors  Cast  \
id            0     0     0             0         0        0       0     0   
Name          0     1     0             0         0        0       0     0   
ReleaseDate   0     0     0             1         0        0       0     0   
Director      0     0     0             0         1        0       0     0   
Creator       0     0     0             0         0        1       0     0   

             Language  Country  Duration  RatingValue  RatingCount  \
id                  0        0         0            0            0   
Name                0        0         0            0            0   
ReleaseDate         0        0         0            0            0   
Director            0        0         0            0            0   
Creator             0        0         0            0            0   

             ReviewCount  Genre  Filming Locations  Description  
id       

In [129]:
threshold =0.7

# Perform schema mapping
predicted_mapping, binary_matrix = schema_mapping_jaccard(MovieMediatedColumns, schema3Columns, MovieMediatedValues ,schema3_values , threshold)
print("\nPredicted Schema Mapping:")
for key, value in predicted_mapping.items():
    print(f"{key} -> {value}")

# Define the ground truth mapping
true_mapping = {
    "Name": "Name",
    "ReleaseDate": "Release Date",
    "Director": "Director",
    "Creator": "Creator"
}

# Calculate precision, recall, and F1 score
precision, recall, f1 = calculate_scores(predicted_mapping, true_mapping)
print("\nPrecision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Combined Binary Similarity Matrix:
             Unnamed: 0  Actors 1  Actors 2  Actors 3  Actors 4  Actors 5  \
id                    0         0         0         0         0         0   
Name                  0         0         0         0         0         0   
ReleaseDate           0         0         0         0         0         0   
Director              0         0         0         0         0         0   
Creator               0         0         0         0         0         0   

             Actors 6  Actors 7  Budget  Dialogue  ...  Overseas Gross  \
id                  0         0       0         0  ...               0   
Name                0         0       0         0  ...               0   
ReleaseDate         0         0       0         0  ...               0   
Director            0         0       0         0  ...               0   
Creator             0         0       0         0  ...               0   

             Producer  Production Banner  Release Date  R

In [130]:
threshold =0.7

predicted_mapping, binary_matrix = schema_mapping_monge_elkan(MovieMediatedColumns, schema1Columns, MovieMediatedValues ,schema1_values , threshold)
print("\nPredicted Schema Mapping:")
for key, value in predicted_mapping.items():
    print(f"{key} -> {value}")

# Define the ground truth mapping
true_mapping = {
    "Name": "Name",
    "ReleaseDate": "Release Date",
    "Director": "Director",
    "Creator": "Creator"
}

# Calculate precision, recall, and F1 score
precision, recall, f1 = calculate_scores(predicted_mapping, true_mapping)
print("\nPrecision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Combined Binary Similarity Matrix:
             Id  Name  YearRange  ReleaseDate  Director  Creator  Cast  \
id            0     0          0            0         0        0     0   
Name          0     1          0            0         0        0     0   
ReleaseDate   0     0          0            1         0        0     0   
Director      0     0          0            0         0        0     0   
Creator       0     0          0            0         0        1     0   

             Duration  RatingValue  ContentRating  Genre  Url  Description  
id                  0            0              0      0    0            0  
Name                0            0              0      0    0            0  
ReleaseDate         0            0              0      0    0            0  
Director            0            0              0      0    0            0  
Creator             0            0              0      0    0            0  

Predicted Schema Mapping:
id -> Id
Name -> Name
ReleaseDa

In [None]:
threshold =0.7

predicted_mapping, binary_matrix = schema_mapping_monge_elkan(MovieMediatedColumns, schema2Columns, MovieMediatedValues ,schema2_values , threshold)
print("\nPredicted Schema Mapping:")
for key, value in predicted_mapping.items():
    print(f"{key} -> {value}")

# Define the ground truth mapping
true_mapping = {
    "Name": "Name",
    "ReleaseDate": "Release Date",
    "Director": "Director",
    "Creator": "Creator"
}

# Calculate precision, recall, and F1 score
precision, recall, f1 = calculate_scores(predicted_mapping, true_mapping)
print("\nPrecision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Combined Binary Similarity Matrix:
             Id  Name  Year  Release Date  Director  Creator  Actors  Cast  \
Name          0     0     0             0         0        0       0     0   
ReleaseDate   0     0     0             0         0        0       0     0   
Director      0     0     0             0         1        0       0     0   
Creator       0     0     0             0         0        1       0     0   

             Language  Country  Duration  RatingValue  RatingCount  \
Name                0        0         0            0            0   
ReleaseDate         0        0         0            0            0   
Director            0        0         0            0            0   
Creator             0        0         0            0            0   

             ReviewCount  Genre  Filming Locations  Description  
Name                   0      0                  0            0  
ReleaseDate            0      0                  0            0  
Director               0 

In [None]:
threshold =0.7

predicted_mapping, binary_matrix = schema_mapping_monge_elkan(MovieMediatedColumns, schema3Columns, MovieMediatedValues ,schema3_values , threshold)
print("\nPredicted Schema Mapping:")
for key, value in predicted_mapping.items():
    print(f"{key} -> {value}")

# Define the ground truth mapping
true_mapping = {
    "Name": "Name",
    "ReleaseDate": "Release Date",
    "Director": "Director",
    "Creator": "Creator"
}

# Calculate precision, recall, and F1 score
precision, recall, f1 = calculate_scores(predicted_mapping, true_mapping)
print("\nPrecision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Combined Binary Similarity Matrix:
             Unnamed: 0  Actors 1  Actors 2  Actors 3  Actors 4  Actors 5  \
Name                  0         0         0         0         0         0   
ReleaseDate           0         0         0         0         0         0   
Director              0         0         0         0         0         0   
Creator               0         0         0         0         0         0   

             Actors 6  Actors 7  Budget  Dialogue  ...  Overseas Gross  \
Name                0         0       0         0  ...               0   
ReleaseDate         0         0       0         0  ...               0   
Director            0         0       0         0  ...               0   
Creator             0         0       0         0  ...               0   

             Producer  Production Banner  Release Date  Runtime  Screenplay  \
Name                0                  0             0        0           0   
ReleaseDate         0                  0          

In [None]:

import numpy as np
from scipy.spatial.distance import cdist

# Affine gap penalty parameters
GAP_OPEN = 1    # Cost for opening a gap
GAP_EXTEND = 0.5  # Cost for extending a gap (per character)

# Jaccard Similarity Function
def jaccard_similarity(str1, str2):
    set1 = set(str1)
    set2 = set(str2)
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union

# Jaccard Distance Function
def jaccard_distance(str1, str2):
    return 1 - jaccard_similarity(str1, str2)

# Edit Distance Function
def edit_distance(str1, str2):
    m = len(str1)
    n = len(str2)

    # Create a distance matrix
    dp = [[0] * (n + 1) for _ in range(m + 1)]

    # Initialize the matrix
    for i in range(m + 1):
        dp[i][0] = i
    for j in range(n + 1):
        dp[0][j] = j

    # Fill the matrix
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if str1[i - 1] == str2[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
            else:
                dp[i][j] = 1 + min(dp[i - 1][j],        # deletion
                                   dp[i][j - 1],        # insertion
                                   dp[i - 1][j - 1])    # substitution

    return dp[m][n]

# Affine Gap Distance Function
def affine_gap_distance(str1, str2):
    m = len(str1)
    n = len(str2)

    # Initialize scoring matrices
    score = np.zeros((m + 1, n + 1))
    gap_open = np.zeros((m + 1, n + 1))
    gap_extend = np.zeros((m + 1, n + 1))

    # Initialize gap penalties
    for i in range(1, m + 1):
        score[i][0] = GAP_OPEN + i * GAP_EXTEND
    for j in range(1, n + 1):
        score[0][j] = GAP_OPEN + j * GAP_EXTEND

    # Fill the matrices
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            substitution_cost = 0 if str1[i - 1] == str2[j - 1] else 1
            score[i][j] = min(
                score[i - 1][j - 1] + substitution_cost,
                gap_open[i - 1][j - 1] + substitution_cost,
                score[i][j - 1] + GAP_OPEN + GAP_EXTEND,
                score[i - 1][j] + GAP_OPEN + GAP_EXTEND
            )
            gap_open[i][j] = min(
                score[i][j - 1] + GAP_EXTEND,
                score[i - 1][j] + GAP_EXTEND
            )

    return score[m][n]

def weighted_average_schema_mapping(schema1, schema2, schema1_values, schema2_values, weights=None):
    if weights is None:
        weights = {
            'jaccard': 1.0,
            'edit': 1.0,
            'affine_gap': 1.0
        }

    # Calculate the distance matrices for column names
    jaccard_name_matrix = cdist(
        np.array(schema1).reshape(-1, 1),
        np.array(schema2).reshape(-1, 1),
        lambda x, y: jaccard_distance(x[0], y[0])
    )
    edit_name_matrix = cdist(
        np.array(schema1).reshape(-1, 1),
        np.array(schema2).reshape(-1, 1),
        lambda x, y: edit_distance(x[0], y[0])
    )
    affine_gap_name_matrix = cdist(
        np.array(schema1).reshape(-1, 1),
        np.array(schema2).reshape(-1, 1),
        lambda x, y: affine_gap_distance(x[0], y[0])
    )

    # Calculate the average distance for sample values
    jaccard_value_matrix = np.zeros((len(schema1), len(schema2)))
    edit_value_matrix = np.zeros((len(schema1), len(schema2)))
    affine_gap_value_matrix = np.zeros((len(schema1), len(schema2)))
    for i, col1 in enumerate(schema1):
        for j, col2 in enumerate(schema2):
            jaccard_distances = [jaccard_distance(val1, val2) for val1, val2 in zip(schema1_values[col1], schema2_values[col2])]
            edit_distances = [edit_distance(val1, val2) for val1, val2 in zip(schema1_values[col1], schema2_values[col2])]
            affine_gap_distances = [affine_gap_distance(val1, val2) for val1, val2 in zip(schema1_values[col1], schema2_values[col2])]
            jaccard_value_matrix[i, j] = np.mean(jaccard_distances)
            edit_value_matrix[i, j] = np.mean(edit_distances)
            affine_gap_value_matrix[i, j] = np.mean(affine_gap_distances)

    # Combine the matrices using the provided weights
    combined_name_matrix = (
        weights['jaccard'] * jaccard_name_matrix +
        weights['edit'] * edit_name_matrix +
        weights['affine_gap'] * affine_gap_name_matrix
    )

    combined_value_matrix = (
        weights['jaccard'] * jaccard_value_matrix +
        weights['edit'] * edit_value_matrix +
        weights['affine_gap'] * affine_gap_value_matrix
    )

    # Create a DataFrame for better visualization
    combined_matrix = combined_name_matrix + combined_value_matrix
    distance_df = pd.DataFrame(combined_matrix, index=schema1, columns=schema2)
    print("Combined Weighted Average Distance Matrix:")
    print(distance_df)

    # Find the best matches based on the minimum distance
    schema_mapping = {}
    for col1 in schema1:
        best_match = distance_df.loc[col1].idxmin()
        schema_mapping[col1] = best_match

    return schema_mapping



weights = {
    'jaccard': 0.5,
    'edit': 1.0,
    'affine_gap': 0.75
}
predicted_mapping = weighted_average_schema_mapping(MovieMediatedColumns, schema3Columns, MovieMediatedValues ,schema3_values , weights)

# Print the predicted schema mapping
print("Predicted Schema Mapping:")
for col1, col2 in predicted_mapping.items():
    print(f"{col1} -> {col2}")

Combined Weighted Average Distance Matrix:
             Unnamed: 0   Actors 1   Actors 2   Actors 3   Actors 4  \
Name          51.400000  53.968695  53.453593  53.802666  53.201055   
ReleaseDate   61.760372  66.105703  65.712128  66.839416  66.048473   
Director      59.566667  56.520552  56.964969  56.761865  56.184470   
Creator       94.873077  90.752352  91.495915  91.052855  90.689206   

              Actors 5   Actors 6   Actors 7     Budget   Dialogue  ...  \
Name         53.812986  52.207769  52.385560  55.617593  55.758021  ...   
ReleaseDate  66.171266  64.863373  64.671680  63.779070  67.042536  ...   
Director     57.339027  55.316682  55.951825  62.407367  56.185629  ...   
Creator      91.399801  90.449862  90.886283  96.382541  92.180283  ...   

             Overseas Gross   Producer  Production Banner  Release Date  \
Name              61.957239  55.237500          64.762697     55.896315   
ReleaseDate       65.454514  67.679620          75.590704     44.453349   
