# In [None]:
# Ensuring Data Consistency: Identifying and Resolving Conflicting Values

import pandas as pd
import numpy as np

# Read the three source extracts.
# Replace 'source_a.csv', 'source_b.csv', and 'source_c.csv' with your actual file paths
df_a, df_b, df_c = (
    pd.read_csv(csv_path)
    for csv_path in ('source_a.csv', 'source_b.csv', 'source_c.csv')
)

# Outer-join all three sources on 'ID' so that an ID present in any source is
# kept. Columns shared by A and B receive the '_a' / '_b' suffixes during the
# first join; the 'Value' column still present after the second join is then
# renamed to 'Value_c'.
# NOTE(review): assumes each source carries a 'Value' column next to 'ID' --
# confirm against the real files.
df_merged = (
    pd.merge(df_a, df_b, on='ID', how='outer', suffixes=('_a', '_b'))
    .merge(df_c, on='ID', how='outer')
    .rename(columns={'Value': 'Value_c'})
)

# Identify conflicting values across sources
def detect_conflict(row):
    """Tell whether the sources disagree for one merged row.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexable by
            'Value_a', 'Value_b' and 'Value_c'.

    Returns:
        True when the non-null values among the three columns contain more
        than one distinct value, False otherwise (including when every
        value is null).
    """
    # Look up all three columns first, then ignore nulls: a value missing
    # from one source is not a disagreement.
    candidates = (row['Value_a'], row['Value_b'], row['Value_c'])
    distinct = set()
    for candidate in candidates:
        if pd.notnull(candidate):
            distinct.add(candidate)
    return len(distinct) > 1

# Evaluate detect_conflict once per row and store the result as a flag column
df_merged['Conflict'] = df_merged.apply(detect_conflict, axis=1)

# Show only the rows whose flag is set
conflicts = df_merged.loc[df_merged['Conflict']]
print("Conflicting rows:", conflicts, sep="\n")

# Resolve conflicts using a preferred source hierarchy: A > B > C
def resolve_conflict(row):
    """Pick a single value for one merged row using the order A, then B, then C.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexable by
            'Value_a', 'Value_b' and 'Value_c'.

    Returns:
        The first non-null value found when checking 'Value_a', 'Value_b'
        and 'Value_c' in that order, or ``np.nan`` when all three are null.
    """
    # Columns are listed from most to least preferred; the first hit wins
    # and later columns are not consulted.
    for column in ('Value_a', 'Value_b', 'Value_c'):
        if pd.notnull(row[column]):
            return row[column]
    return np.nan

# Apply the A > B > C precedence to every row to obtain one value per ID row
df_merged['Resolved_Value'] = df_merged.apply(resolve_conflict, axis=1)

# Keep just the key and the resolved value; the per-source columns and the
# conflict flag are left out of the output
df_cleaned = df_merged.loc[:, ['ID', 'Resolved_Value']]

# Write the result without the DataFrame index
df_cleaned.to_csv('cleaned_dataset.csv', index=False)

print("Data consistency ensured. Cleaned dataset saved to 'cleaned_dataset.csv'.")