In [7]:
import os
import re

import pandas as pd
from lxml import etree

def sanitize_column_name(col_name):
    """Turn a column label into a string that can be used as an XML tag name.

    Every character that is not a word character (letter, digit, underscore)
    or a dot is replaced with an underscore; this covers spaces, '-' and '/'
    as well as punctuation such as '(', ')', '%', ':' or '&'. If the result is
    empty or does not begin with a letter or underscore, an underscore is
    prepended, because an XML name may not start with a digit or a dot.

    Args:
        col_name: Column label. Non-string labels are converted with ``str()``.

    Returns:
        str: The sanitized label.
    """
    name = re.sub(r'[^\w.]', '_', str(col_name))
    # XML names must start with a letter or underscore.
    if not name or not (name[0].isalpha() or name[0] == '_'):
        name = '_' + name
    return name

def clean_dataframe(df):
    """Return a new DataFrame without 'Unnamed...' columns and with sanitized labels.

    Every column whose label starts with 'Unnamed' is removed, whether or not
    it holds data. Each remaining label is passed through
    ``sanitize_column_name``. The frame passed in is not modified.

    Args:
        df: DataFrame with string column labels.

    Returns:
        The filtered DataFrame with rewritten column labels.
    """
    unnamed_mask = df.columns.str.contains('^Unnamed')
    cleaned = df.loc[:, ~unnamed_mask]
    cleaned.columns = list(map(sanitize_column_name, cleaned.columns))
    return cleaned

def csv_to_xml(df, root_element_name):
    """Serialize a DataFrame to a pretty-printed XML document.

    Each row becomes an ``<item>`` child of the root element. Each cell that
    is not missing (per ``pd.notna``) becomes a child of that item, tagged
    with the column label and holding ``str(value)`` as its text. Missing
    cells produce no element at all.

    Args:
        df: DataFrame whose column labels are usable as XML tag names.
        root_element_name: Tag name of the document's root element.

    Returns:
        The serialized document as produced by ``etree.tostring`` with
        ``encoding='utf-8'``.
    """
    root = etree.Element(root_element_name)
    columns = list(df.columns)
    # itertuples() keeps every column's own dtype. iterrows() builds one
    # Series per row and up-casts it to a common dtype, so an integer such as
    # 1994 can be written out as "1994.0". Pairing labels and values by
    # position also keeps working when two columns share a label.
    for values in df.itertuples(index=False, name=None):
        item = etree.SubElement(root, "item")
        for col, value in zip(columns, values):
            if pd.notna(value):  # skip NaN / None / NaT cells
                child = etree.SubElement(item, col)
                child.text = str(value)
    return etree.tostring(root, pretty_print=True, encoding='utf-8')

# List of CSV files to read; every one of them is required by the conversion below
csv_files = ['imdb.csv', 'bollywood.csv', 'rottentomatoes.csv', 'MovieOverview.csv']

# Load and clean each CSV that exists, keyed by its file name
dataframes = {}
for csv_file in csv_files:
    if os.path.exists(csv_file):
        df = pd.read_csv(csv_file, encoding='utf-8', index_col=False)
        dataframes[csv_file] = clean_dataframe(df)
    else:
        print(f"File {csv_file} not found in the current directory.")

# Fail fast with a clear message: all four frames are looked up below, so
# carrying on would only surface as an opaque KeyError.
missing_files = [name for name in csv_files if name not in dataframes]
if missing_files:
    raise FileNotFoundError(
        "Cannot convert to XML; missing CSV file(s): " + ", ".join(missing_files)
    )

# Convert to XML
imdb_xml = csv_to_xml(dataframes['imdb.csv'], 'IMDB')
bollywood_xml = csv_to_xml(dataframes['bollywood.csv'], 'Bollywood')
rottentomatoes_xml = csv_to_xml(dataframes['rottentomatoes.csv'], 'RottenTomatoes')
movie_review_xml = csv_to_xml(dataframes['MovieOverview.csv'], 'MovieOverview')

# Save XML files (binary mode, as the files were written originally)
xml_outputs = {
    'imdb.xml': imdb_xml,
    'bollywood.xml': bollywood_xml,
    'rottentomatoes.xml': rottentomatoes_xml,
    'MovieOverview.xml': movie_review_xml,
}
for xml_path, xml_bytes in xml_outputs.items():
    with open(xml_path, 'wb') as f:
        f.write(xml_bytes)


In [9]:
from lxml import etree

def parse_xml(file_path):
    """Parse the XML document at ``file_path`` and return the tree from ``etree.parse``."""
    return etree.parse(file_path)

def create_schema_tree_from_xml(xml_tree):
    """Map every element tag in ``xml_tree`` to the type name of its text.

    The tree is walked from the root in document order. For each element the
    entry ``tag -> type(element.text).__name__`` is stored, so the value is
    ``'str'`` when the element has text and ``'NoneType'`` when it has none.
    When a tag occurs more than once, the last occurrence wins.

    Args:
        xml_tree: Parsed tree exposing ``getroot()``; elements expose
            ``iter()``, ``tag`` and ``text``.

    Returns:
        dict: Tag name -> type name of that element's text.
    """
    schema_tree = {}
    for element in xml_tree.getroot().iter():
        # iter() also yields comments and processing instructions; their .tag
        # is a factory function, not a string. Skip them so every key is a
        # tag name that string operations such as .split('_') can handle.
        if not isinstance(element.tag, str):
            continue
        schema_tree[element.tag] = type(element.text).__name__
    return schema_tree

# Parse XML files
imdb_tree = parse_xml('imdb.xml')
bollywood_tree = parse_xml('bollywood.xml')
rottentomatoes_tree = parse_xml('rottentomatoes.xml')
moviereview_tree = parse_xml('MovieOverview.xml')

# Create schema trees
imdb_schema_tree = create_schema_tree_from_xml(imdb_tree)
bollywood_schema_tree = create_schema_tree_from_xml(bollywood_tree)
rottentomatoes_schema_tree = create_schema_tree_from_xml(rottentomatoes_tree)
moviereview_schema_tree = create_schema_tree_from_xml(moviereview_tree)

# Save schema trees to a file: one heading line per source, followed by the
# dict repr and a blank line.
schema_report = [
    ("IMDB Schema Tree:", imdb_schema_tree),
    ("Bollywood Schema Tree:", bollywood_schema_tree),
    ("Rotten Tomatoes Schema Tree:", rottentomatoes_schema_tree),
    ("Movie Review Schema Tree:", moviereview_schema_tree),
]
# Explicit UTF-8 so tag names with non-ASCII characters are written the same
# way on every platform instead of depending on the locale's default encoding.
with open('schema_trees.txt', 'w', encoding='utf-8') as f:
    for heading, section_schema in schema_report:
        f.write(heading + "\n")
        f.write(str(section_schema) + "\n\n")


In [10]:
import numpy as np
from sklearn.metrics import jaccard_score

def jaccard_similarity(set1, set2):
    """Return the Jaccard similarity of two sets: |intersection| / |union|.

    Args:
        set1: A set (anything providing ``union`` and ``intersection``).
        set2: A set or other iterable accepted by those methods.

    Returns:
        float: A value between 0.0 and 1.0. When both inputs are empty the
        ratio is 0/0; 0.0 is returned in that case instead of raising
        ``ZeroDivisionError``, so empty inputs never count as a match.
    """
    union = len(set1.union(set2))
    if union == 0:
        # Both sets are empty: nothing is shared, report no similarity.
        return 0.0
    return len(set1.intersection(set2)) / union

def create_similarity_matrix(schema_tree1, schema_tree2):
    """Score every tag of one schema against every tag of another.

    Each tag is split on '_' into a set of tokens, and two tags are compared
    with ``jaccard_similarity`` on those token sets.

    Args:
        schema_tree1: Mapping whose keys are the row tags.
        schema_tree2: Mapping whose keys are the column tags.

    Returns:
        tuple: ``(matrix, nodes1, nodes2)`` where ``matrix[i, j]`` is the
        similarity between ``nodes1[i]`` and ``nodes2[j]``, and the two lists
        hold the keys of the respective mappings in iteration order.
    """
    row_tags, col_tags = list(schema_tree1.keys()), list(schema_tree2.keys())
    scores = np.zeros((len(row_tags), len(col_tags)))

    # Fill the matrix one row at a time.
    for row, row_tag in enumerate(row_tags):
        row_tokens = set(row_tag.split('_'))
        scores[row, :] = [
            jaccard_similarity(row_tokens, set(col_tag.split('_')))
            for col_tag in col_tags
        ]

    return scores, row_tags, col_tags

def match_nodes_with_similarity_matrix(similarity_matrix, nodes1, nodes2, threshold=0.5):
    """Collect the tag pairs whose similarity reaches ``threshold``.

    Args:
        similarity_matrix: 2-D array indexed as ``[i, j]`` for ``nodes1[i]``
            and ``nodes2[j]``.
        nodes1: Row labels.
        nodes2: Column labels.
        threshold: Minimum score (inclusive) for a pair to be kept.

    Returns:
        dict: ``(node1, node2) -> score`` for every kept pair, in row-major
        order.
    """
    return {
        (left, right): similarity_matrix[row, col]
        for row, left in enumerate(nodes1)
        for col, right in enumerate(nodes2)
        if similarity_matrix[row, col] >= threshold
    }

# Create similarity matrices: each source schema is compared against the
# MovieOverview schema, so moviereview_nodes holds the same list after each call.
imdb_moviereview_similarity_matrix, imdb_nodes, moviereview_nodes = create_similarity_matrix(imdb_schema_tree, moviereview_schema_tree)
bollywood_moviereview_similarity_matrix, bollywood_nodes, moviereview_nodes = create_similarity_matrix(bollywood_schema_tree, moviereview_schema_tree)
rottentomatoes_moviereview_similarity_matrix, rottentomatoes_nodes, moviereview_nodes = create_similarity_matrix(rottentomatoes_schema_tree, moviereview_schema_tree)

# Match nodes (default threshold of match_nodes_with_similarity_matrix)
imdb_moviereview_matches = match_nodes_with_similarity_matrix(imdb_moviereview_similarity_matrix, imdb_nodes, moviereview_nodes)
bollywood_moviereview_matches = match_nodes_with_similarity_matrix(bollywood_moviereview_similarity_matrix, bollywood_nodes, moviereview_nodes)
rottentomatoes_moviereview_matches = match_nodes_with_similarity_matrix(rottentomatoes_moviereview_similarity_matrix, rottentomatoes_nodes, moviereview_nodes)

# Save node matches to a file: one heading line per source, followed by the
# dict repr and a blank line.
match_report = [
    ("IMDB - Movie Review Matches:", imdb_moviereview_matches),
    ("Bollywood - Movie Review Matches:", bollywood_moviereview_matches),
    ("Rotten Tomatoes - Movie Review Matches:", rottentomatoes_moviereview_matches),
]
# Explicit UTF-8 so tag names with non-ASCII characters are written the same
# way on every platform instead of depending on the locale's default encoding.
with open('node_matches.txt', 'w', encoding='utf-8') as f:
    for heading, section_matches in match_report:
        f.write(heading + "\n")
        f.write(str(section_matches) + "\n\n")
