In [2]:
import json

with open ('users.json','r') as file:
    data = file.read().strip().split('\n')

data = [json.loads(line) for line in data]



In [3]:
import json
import os

def load_data(file_path):
    """Load JSON data from a file, handling non-standard JSON formats."""
    with open(file_path, 'r') as file:
        data = file.read().strip().split('\n')
    return [json.loads(line) for line in data]

def check_consistency(data):
    """Identify inconsistencies in data types across similar fields."""
    inconsistencies = []
    for i, record in enumerate(data):
        for key, value in record.items():
            if isinstance(value, str) and any(isinstance(rec.get(key, ""), (int, float)) for rec in data):
                inconsistencies.append((i, key, type(value)))
    return inconsistencies

def check_uniqueness(data):
    """Calculate the number of duplicate records."""
    unique_records = [dict(t) for t in {tuple(sorted(d.items())) for d in data}]
    duplicate_count = len(data) - len(unique_records)
    return duplicate_count

def check_completeness(data, required_fields):
    """Check for missing required fields in records."""
    missing_fields = []
    for i, record in enumerate(data):
        missing = [field for field in required_fields if field not in record]
        if missing:
            missing_fields.append((i, missing))
    return missing_fields

def analyze_dataset(file_path, required_fields):
    print(f"\nAnalyzing {os.path.basename(file_path)}...")
    data = load_data(file_path)
    
    # Consistency check
    inconsistencies = check_consistency(data)
    print(f"Found inconsistencies in data types: {inconsistencies}")
    
    # Uniqueness verification
    duplicate_count = check_uniqueness(data)
    print(f"Found {duplicate_count} duplicate records.")
    
    # Completeness assessment
    missing_fields = check_completeness(data, required_fields)
    print(f"Records missing required fields: {missing_fields}")

def main(dataset_paths):
    required_fields = {
        'users.json': ['_id', 'state', 'createdDate','lastLogin', 'role', 'active'],  
        'receipts.json': ['_id', 'binusPointsEarned','bonusPointearnedReason','createDate','dateScanned','finishedDate','modifyDate','pointsAwardedDate','pointsEarned','purchaseDate','purchasedItemCount','rewardsReceiptItemList','rewardsReceiptStatus','totalSpent','user_Id'],
        'brands.json': ['_id', 'barcode','brandCode','category','categoryCode','cpg','topBrand','name']
    }
    
    for file_path in dataset_paths:
        file_name = os.path.basename(file_path)
        analyze_dataset(file_path, required_fields.get(file_name, []))

# List of dataset paths
dataset_paths = [
    'users.json',
    'receipts.json',
    'brands.json'
]

main(dataset_paths)



Analyzing users.json...
Found inconsistencies in data types: []


TypeError: unhashable type: 'dict'