In [5]:
import pandas as pd

# Load the result CSV; the path is relative, so it resolves against the
# kernel's current working directory.
result_df = pd.read_csv('../data/231229_FIRST_FINAL_RESULT.csv')

# Function to find all IDs that share a URL
def find_overlapping_ids(df):
    """Map every ID to the set of other IDs that share a URL with it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'id' and 'OpenCorporates_url'.

    Returns
    -------
    dict
        ``{id: set_of_other_ids}``. An ID whose URLs are shared with nobody
        maps to an empty set. IDs that only appear on rows with a missing URL
        are absent from the result.
    """
    overlapping_ids = {}
    # One grouping pass instead of a full-frame boolean scan per unique URL.
    # sort=False keeps the groups in order of first appearance, and groupby
    # skips rows whose URL is missing.
    for _, ids_for_url in df.groupby('OpenCorporates_url', sort=False)['id']:
        ids_with_same_url = ids_for_url.unique()
        for current_id in ids_with_same_url:
            peers = overlapping_ids.setdefault(current_id, set())
            peers.update(ids_with_same_url)
            peers.discard(current_id)  # Remove self-reference
    return overlapping_ids

# Function to detect contradictions
def detect_contradictions(overlaps):
    """Collect the ordered ID pairs whose overlap is recorded in both directions.

    Parameters
    ----------
    overlaps : dict
        Mapping of an ID to the set of IDs it overlaps with.

    Returns
    -------
    list of tuple
        ``(id, other_id)`` for each entry where ``other_id`` is itself a key
        of ``overlaps`` and lists ``id`` among its overlaps. A mutual pair is
        therefore reported twice, once per direction.
    """
    return [
        (source_id, partner_id)
        for source_id, partners in overlaps.items()
        for partner_id in partners
        if source_id in overlaps.get(partner_id, ())
    ]


In [6]:
# Show the loaded frame as the cell output to inspect its columns.
result_df

Unnamed: 0,id,notifying_party,cleaned_name,preprocessed_id,name,company_id,key-identity-based-id,score,api_id,oc-gpt-id-1,legal_type,oc_name_preprocessed,OpenCorporates_url
0,0,ROTHENBERGER Vermögensverwaltung GbR 4xS,rothenberger vermoegensverwaltung gbr 4xs,1.0,,,1,,,1,1.0,,
1,1,"Rothenberger, Dr., Helmut",rothenberger dr helmut,2.0,,,2,,,2,0.0,,
2,2,"Schneider, Franz Jürgen",schneider franz juergen,3.0,,,3,,,3,0.0,,
3,3,3i Group plc,3i group plc,4.0,,,4,56.0,/companies/gb/01142830,4,1.0,3i group plc,https://opencorporates.com/companies/gb/01142830
4,4,3i Deutschland Gesellschaft für Industriebetei...,3i deutschland gesellschaft fuer industriebete...,5.0,,,5,52.0,/companies/de/M1201_HRB26172,5,1.0,3i deutschland gesellschaft fuer industriebete...,https://opencorporates.com/companies/de/M1201_...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23137,14453,"Hagemann, Hanno",hagemann hanno,13786.0,,,14185,,,14377,0.0,,
23138,14454,"Krüper, Sebastian",krueper sebastian,13787.0,,,14186,,,14378,0.0,,
23139,14455,"Müller, Tarek",mueller tarek,13788.0,,,14187,,,14379,0.0,,
23140,14456,"Riemer, Frank",riemer frank,13789.0,,,14188,,,14380,0.0,,


In [7]:
# Finding overlaps and contradictions
# `overlaps` maps each ID to the other IDs that share an OpenCorporates_url with it.
overlaps = find_overlapping_ids(result_df)
# `contradictions` holds the (id, other_id) pairs that detect_contradictions
# finds recorded in both directions of `overlaps`.
contradictions = detect_contradictions(overlaps)

# Display the overlaps mapping as the cell output (bare expression, not print)
overlaps

{3: {1830, 5902},
 1830: {3, 5902},
 5902: {3, 1830},
 4: set(),
 8: {1518, 5342, 7704},
 1518: {8, 5342, 7704},
 5342: {8, 1518, 7704},
 7704: {8, 1518, 5342},
 10: {1350, 7705},
 1350: {10},
 7705: {10},
 11: set(),
 12: set(),
 13: set(),
 16: {2561, 5729, 5943, 8520},
 2561: {16, 5729, 5943, 8520},
 5729: {16, 2561, 5943, 8520},
 5943: {16, 2561, 5729, 8520},
 8520: {16, 2561, 5729, 5943},
 17: {1556, 5326},
 1556: {17, 5326},
 5326: {17, 1556},
 18: set(),
 19: {5676, 8898},
 5676: {19, 8898},
 8898: {19, 5676},
 23: {1557},
 1557: {23},
 26: set(),
 34: set(),
 35: set(),
 37: set(),
 48: {5387, 5782},
 5387: {48, 5782},
 5782: {48, 5387},
 50: {5171, 5667},
 5171: {50, 5667},
 5667: {50, 5171},
 51: set(),
 52: {693},
 693: {52},
 53: set(),
 55: set(),
 56: set(),
 64: {5166, 5688},
 5166: {64, 5688},
 5688: {64, 5166},
 69: set(),
 71: {10427},
 10427: {71},
 80: set(),
 82: set(),
 84: {5694, 6608},
 5694: {84, 6608},
 6608: {84, 5694},
 85: {5728},
 5728: {85},
 89: {8231},


In [8]:
# Show the pairs returned by detect_contradictions. Each mutual pair appears
# in both orders, e.g. (3, 1830) and (1830, 3).
contradictions

[(3, 1830),
 (3, 5902),
 (1830, 3),
 (1830, 5902),
 (5902, 3),
 (5902, 1830),
 (8, 7704),
 (8, 5342),
 (8, 1518),
 (1518, 8),
 (1518, 7704),
 (1518, 5342),
 (5342, 8),
 (5342, 7704),
 (5342, 1518),
 (7704, 8),
 (7704, 5342),
 (7704, 1518),
 (10, 7705),
 (10, 1350),
 (1350, 10),
 (7705, 10),
 (16, 2561),
 (16, 5729),
 (16, 8520),
 (16, 5943),
 (2561, 5729),
 (2561, 8520),
 (2561, 16),
 (2561, 5943),
 (5729, 2561),
 (5729, 8520),
 (5729, 16),
 (5729, 5943),
 (5943, 2561),
 (5943, 5729),
 (5943, 8520),
 (5943, 16),
 (8520, 2561),
 (8520, 5729),
 (8520, 16),
 (8520, 5943),
 (17, 1556),
 (17, 5326),
 (1556, 17),
 (1556, 5326),
 (5326, 17),
 (5326, 1556),
 (19, 8898),
 (19, 5676),
 (5676, 8898),
 (5676, 19),
 (8898, 19),
 (8898, 5676),
 (23, 1557),
 (1557, 23),
 (48, 5387),
 (48, 5782),
 (5387, 48),
 (5387, 5782),
 (5782, 48),
 (5782, 5387),
 (50, 5667),
 (50, 5171),
 (5171, 5667),
 (5171, 50),
 (5667, 50),
 (5667, 5171),
 (52, 693),
 (693, 52),
 (64, 5688),
 (64, 5166),
 (5166, 64),
 (5166,

In [13]:
import pandas as pd
import networkx as nx

# Work on the frame loaded earlier in the notebook; it provides the
# 'id' and 'OpenCorporates_url' columns used below.
df = result_df

# Create a graph
G = nx.Graph()

# Add edges based on shared URLs. Grouping once by URL replaces a full-frame
# scan per unique URL; sort=False keeps first-appearance order and groupby
# skips rows whose URL is missing.
for url, ids_for_url in df.groupby('OpenCorporates_url', sort=False)['id']:
    ids = ids_for_url.tolist()
    for i in range(len(ids)):
        for j in range(i+1, len(ids)):
            G.add_edge(ids[i], ids[j])

# Find connected components (groups of IDs potentially belonging to the same company)
connected_components = list(nx.connected_components(G))

# Collect the distinct URLs of every ID once. Missing URLs are dropped
# because str.join raises TypeError on a non-string entry.
urls_by_id = (
    df.dropna(subset=['OpenCorporates_url'])
    .groupby('id')['OpenCorporates_url']
    .unique()
)

# Output the results
for i, component in enumerate(connected_components):
    print(f"Group {i+1}:")
    for member_id in component:
        urls = urls_by_id[member_id]
        print(f"  ID {member_id} - URLs: {', '.join(urls)}")
    print()


Group 1:
  ID 3 - URLs: https://opencorporates.com/companies/gb/01142830
  ID 1830 - URLs: https://opencorporates.com/companies/gb/01142830
  ID 5902 - URLs: https://opencorporates.com/companies/gb/01142830

Group 2:
  ID 8 - URLs: https://opencorporates.com/companies/de/R3101_HRB93
  ID 7704 - URLs: https://opencorporates.com/companies/de/R3101_HRB93, https://opencorporates.com/companies/de/R3306_HRB66277
  ID 5342 - URLs: https://opencorporates.com/companies/de/R3101_HRB93
  ID 1518 - URLs: https://opencorporates.com/companies/de/R3101_HRB93

Group 3:
  ID 7705 - URLs: https://opencorporates.com/companies/de/M1201_HRB21915, https://opencorporates.com/companies/de/K1101R_HRB40187, https://opencorporates.com/companies/de/D2601V_HRB41044, https://opencorporates.com/companies/de/D2601V_HRB177657
  ID 10 - URLs: https://opencorporates.com/companies/de/K1101R_HRB4649, https://opencorporates.com/companies/de/K1101R_HRB40187
  ID 1350 - URLs: https://opencorporates.com/companies/de/K1101R_HR

In [16]:
import pandas as pd
import networkx as nx

# Work on the frame loaded earlier in the notebook; it provides the
# 'id' and 'OpenCorporates_url' columns used below.
df = result_df

# Create a graph
G = nx.Graph()

# Add edges based on shared URLs. Grouping once by URL replaces a full-frame
# scan per unique URL; sort=False keeps first-appearance order and groupby
# skips rows whose URL is missing.
for url, ids_for_url in df.groupby('OpenCorporates_url', sort=False)['id']:
    ids = ids_for_url.tolist()
    for i in range(len(ids)):
        for j in range(i+1, len(ids)):
            G.add_edge(ids[i], ids[j])

# Find connected components
connected_components = list(nx.connected_components(G))

# Function to find inconsistencies within a group
def find_inconsistencies(group, df):
    """Check whether some trio of IDs in ``group`` has no URL in common.

    For every pair of distinct IDs the URLs they share are computed (possibly
    none). The group is inconsistent as soon as a third ID has no URL among
    those shared ones.

    Parameters
    ----------
    group : iterable
        IDs forming one group (e.g. a connected component).
    df : pandas.DataFrame
        Must provide the columns 'id' and 'OpenCorporates_url'.

    Returns
    -------
    bool
        True if an inconsistency is found, False otherwise. Groups with fewer
        than three distinct IDs always return False.
    """
    members = list(dict.fromkeys(group))  # distinct IDs, stable order
    # Look up each member's URLs once instead of re-filtering df inside the
    # triple loop. Missing URLs are dropped so that a shared "missing" marker
    # is never mistaken for a shared URL.
    urls_by_id = {
        member: set(df.loc[df['id'] == member, 'OpenCorporates_url'].dropna())
        for member in members
    }
    # The intersection is symmetric, so each unordered pair is checked once.
    for pos, id1 in enumerate(members):
        for id2 in members[pos + 1:]:
            shared_urls = urls_by_id[id1] & urls_by_id[id2]
            for id3 in members:
                if id3 != id1 and id3 != id2 and shared_urls.isdisjoint(urls_by_id[id3]):
                    return True  # Inconsistency found
    return False  # No inconsistencies

# Analyze each group for inconsistencies
for i, component in enumerate(connected_components):
    print(f"Group {i+1}:")
    inconsistent = find_inconsistencies(component, df)
    if inconsistent:
        print("  Inconsistent Group Detected")
    else:
        print("  Consistent Group")
    for member_id in component:
        # Drop missing URLs first: str.join raises TypeError on a non-string entry.
        urls = df.loc[df['id'] == member_id, 'OpenCorporates_url'].dropna().unique()
        print(f"  ID {member_id} - URLs: {', '.join(urls)}")
    print()


Group 1:
  Consistent Group
  ID 3 - URLs: https://opencorporates.com/companies/gb/01142830
  ID 1830 - URLs: https://opencorporates.com/companies/gb/01142830
  ID 5902 - URLs: https://opencorporates.com/companies/gb/01142830

Group 2:
  Consistent Group
  ID 8 - URLs: https://opencorporates.com/companies/de/R3101_HRB93
  ID 7704 - URLs: https://opencorporates.com/companies/de/R3101_HRB93, https://opencorporates.com/companies/de/R3306_HRB66277
  ID 5342 - URLs: https://opencorporates.com/companies/de/R3101_HRB93
  ID 1518 - URLs: https://opencorporates.com/companies/de/R3101_HRB93

Group 3:
  Inconsistent Group Detected
  ID 7705 - URLs: https://opencorporates.com/companies/de/M1201_HRB21915, https://opencorporates.com/companies/de/K1101R_HRB40187, https://opencorporates.com/companies/de/D2601V_HRB41044, https://opencorporates.com/companies/de/D2601V_HRB177657
  ID 10 - URLs: https://opencorporates.com/companies/de/K1101R_HRB4649, https://opencorporates.com/companies/de/K1101R_HRB40187

In [17]:
import pandas as pd
import networkx as nx

# Work on the frame loaded earlier in the notebook; it provides the
# 'id' and 'OpenCorporates_url' columns used below.
df = result_df

# Create a graph
G = nx.Graph()

# Add edges based on shared URLs. Grouping once by URL replaces a full-frame
# scan per unique URL; sort=False keeps first-appearance order and groupby
# skips rows whose URL is missing.
for url, ids_for_url in df.groupby('OpenCorporates_url', sort=False)['id']:
    ids = ids_for_url.tolist()
    for i in range(len(ids)):
        for j in range(i+1, len(ids)):
            G.add_edge(ids[i], ids[j])

# Find connected components
connected_components = list(nx.connected_components(G))

# Function to find inconsistencies within a group
def find_inconsistencies(group, df):
    """Check whether some trio of IDs in ``group`` has no URL in common.

    For every pair of distinct IDs the URLs they share are computed (possibly
    none). The group is inconsistent as soon as a third ID has no URL among
    those shared ones.

    Parameters
    ----------
    group : iterable
        IDs forming one group (e.g. a connected component).
    df : pandas.DataFrame
        Must provide the columns 'id' and 'OpenCorporates_url'.

    Returns
    -------
    bool
        True if an inconsistency is found, False otherwise. Groups with fewer
        than three distinct IDs always return False.
    """
    members = list(dict.fromkeys(group))  # distinct IDs, stable order
    # Look up each member's URLs once instead of re-filtering df inside the
    # triple loop. Missing URLs are dropped so that a shared "missing" marker
    # is never mistaken for a shared URL.
    urls_by_id = {
        member: set(df.loc[df['id'] == member, 'OpenCorporates_url'].dropna())
        for member in members
    }
    # The intersection is symmetric, so each unordered pair is checked once.
    for pos, id1 in enumerate(members):
        for id2 in members[pos + 1:]:
            shared_urls = urls_by_id[id1] & urls_by_id[id2]
            for id3 in members:
                if id3 != id1 and id3 != id2 and shared_urls.isdisjoint(urls_by_id[id3]):
                    return True  # Inconsistency found
    return False  # No inconsistencies

# Analyze each group for inconsistencies
for i, component in enumerate(connected_components):
    if len(component) >= 3:  # Check if the group has at least three IDs
        print(f"Group {i+1}:")
        inconsistent = find_inconsistencies(component, df)
        if inconsistent:
            print("  Inconsistent Group Detected")
        else:
            print("  Consistent Group")
        for member_id in component:
            # Drop missing URLs first: str.join raises TypeError on a non-string entry.
            urls = df.loc[df['id'] == member_id, 'OpenCorporates_url'].dropna().unique()
            print(f"  ID {member_id} - URLs: {', '.join(urls)}")
        print()


Group 1:
  Consistent Group
  ID 3 - URLs: https://opencorporates.com/companies/gb/01142830
  ID 1830 - URLs: https://opencorporates.com/companies/gb/01142830
  ID 5902 - URLs: https://opencorporates.com/companies/gb/01142830

Group 2:
  Consistent Group
  ID 8 - URLs: https://opencorporates.com/companies/de/R3101_HRB93
  ID 7704 - URLs: https://opencorporates.com/companies/de/R3101_HRB93, https://opencorporates.com/companies/de/R3306_HRB66277
  ID 5342 - URLs: https://opencorporates.com/companies/de/R3101_HRB93
  ID 1518 - URLs: https://opencorporates.com/companies/de/R3101_HRB93

Group 3:
  Inconsistent Group Detected
  ID 7705 - URLs: https://opencorporates.com/companies/de/M1201_HRB21915, https://opencorporates.com/companies/de/K1101R_HRB40187, https://opencorporates.com/companies/de/D2601V_HRB41044, https://opencorporates.com/companies/de/D2601V_HRB177657
  ID 10 - URLs: https://opencorporates.com/companies/de/K1101R_HRB4649, https://opencorporates.com/companies/de/K1101R_HRB40187

In [18]:
import pandas as pd
import networkx as nx

# Work on the frame loaded earlier in the notebook; it provides the
# 'id' and 'OpenCorporates_url' columns used below.
df = result_df

# Create a graph
G = nx.Graph()

# Add edges based on shared URLs. Grouping once by URL replaces a full-frame
# scan per unique URL; sort=False keeps first-appearance order and groupby
# skips rows whose URL is missing.
for url, ids_for_url in df.groupby('OpenCorporates_url', sort=False)['id']:
    ids = ids_for_url.tolist()
    for i in range(len(ids)):
        for j in range(i+1, len(ids)):
            G.add_edge(ids[i], ids[j])

# Find connected components
connected_components = list(nx.connected_components(G))

# Function to find inconsistencies within a group
def find_inconsistencies(group, df):
    """Check whether some trio of IDs in ``group`` has no URL in common.

    For every pair of distinct IDs the URLs they share are computed (possibly
    none). The group is inconsistent as soon as a third ID has no URL among
    those shared ones.

    Parameters
    ----------
    group : iterable
        IDs forming one group (e.g. a connected component).
    df : pandas.DataFrame
        Must provide the columns 'id' and 'OpenCorporates_url'.

    Returns
    -------
    bool
        True if an inconsistency is found, False otherwise. Groups with fewer
        than three distinct IDs always return False.
    """
    members = list(dict.fromkeys(group))  # distinct IDs, stable order
    # Look up each member's URLs once instead of re-filtering df inside the
    # triple loop. Missing URLs are dropped so that a shared "missing" marker
    # is never mistaken for a shared URL.
    urls_by_id = {
        member: set(df.loc[df['id'] == member, 'OpenCorporates_url'].dropna())
        for member in members
    }
    # The intersection is symmetric, so each unordered pair is checked once.
    for pos, id1 in enumerate(members):
        for id2 in members[pos + 1:]:
            shared_urls = urls_by_id[id1] & urls_by_id[id2]
            for id3 in members:
                if id3 != id1 and id3 != id2 and shared_urls.isdisjoint(urls_by_id[id3]):
                    return True  # Inconsistency found
    return False  # No inconsistencies

# Analyze each group for inconsistencies
for i, component in enumerate(connected_components):
    if len(component) >= 3 and find_inconsistencies(component, df):  # Check for at least three IDs and inconsistency
        print(f"Inconsistent Group {i+1}:")
        for member_id in component:
            # Drop missing URLs first: str.join raises TypeError on a non-string entry.
            urls = df.loc[df['id'] == member_id, 'OpenCorporates_url'].dropna().unique()
            print(f"  ID {member_id} - URLs: {', '.join(urls)}")
        print()


Inconsistent Group 3:
  ID 7705 - URLs: https://opencorporates.com/companies/de/M1201_HRB21915, https://opencorporates.com/companies/de/K1101R_HRB40187, https://opencorporates.com/companies/de/D2601V_HRB41044, https://opencorporates.com/companies/de/D2601V_HRB177657
  ID 10 - URLs: https://opencorporates.com/companies/de/K1101R_HRB4649, https://opencorporates.com/companies/de/K1101R_HRB40187
  ID 1350 - URLs: https://opencorporates.com/companies/de/K1101R_HRB4649

Inconsistent Group 39:
  ID 176 - URLs: https://opencorporates.com/companies/de/B8537_HRB540215
  ID 11489 - URLs: https://opencorporates.com/companies/de/D2601V_HRB214769, https://opencorporates.com/companies/de/B8537_HRB540215
  ID 12616 - URLs: https://opencorporates.com/companies/de/D2601V_HRB214769

Inconsistent Group 55:
  ID 5921 - URLs: https://opencorporates.com/companies/de/R3306_HRB603
  ID 5674 - URLs: https://opencorporates.com/companies/de/R3306_HRB603
  ID 2614 - URLs: https://opencorporates.com/companies/de/P2