In [108]:
import json
def load_data(filename):
    with open(filename, 'r') as f:
        data = json.load(f)
        
    return data
    

In [136]:
data = load_data("Data cleaning part//data_json.json")


In [137]:
# Function to clean the data
def clean_data(data):  
    # Defines a function named clean_data
    # It takes one argument: data (expected to be a list of dictionaries)

    text_to_number = {  
        # Dictionary to convert text ratings into numbers
        "one": 1,
        "two": 2,
        "three": 3,
        "four": 4,
        "five": 5
    }

    cleaned_data = []  
    # This list will store the final cleaned user records

    unique_users = set()  
    # A set to keep track of user names we have already seen
    # Sets automatically prevent duplicates

    for user in data:  
        # Loop through each user dictionary in the data list

        # -------- CLEAN RATING --------
        raw_rating = user["rating"]  
        # Extract the rating value from the user dictionary

        if isinstance(raw_rating, str):  
            # Check if rating is a string
            # This avoids calling string methods on numbers

            raw_rating = raw_rating.strip().lower()  
            # Remove extra spaces and convert to lowercase
            # Example: " Five " â†’ "five"

            if raw_rating in text_to_number:  
                # If rating is a word like "five"
                raw_rating = text_to_number[raw_rating]  
                # Convert word to number using dictionary lookup

            elif "." in raw_rating:  
                # If rating contains a decimal point like "3.5"
                raw_rating = float(raw_rating)  
                # Convert string to float

            else:  
                # Otherwise it must be a whole number string like "4"
                raw_rating = int(raw_rating)  
                # Convert string to integer

        user["rating"] = raw_rating  
        # Save the cleaned numeric rating back into the user dictionary

        # -------- CLEAN AGE --------
        raw_age = user.get("age")  
        # Safely get age value (returns None if key is missing)

        if raw_age == None:  
            # If age is missing or explicitly null
            user["age"] = None  
            # Keep it as None (explicitly clean)

        # -------- REMOVE DUPLICATES --------
        if user["name"].strip() in unique_users:  
            # Check if this user's name already exists in the set
            # strip() removes extra spaces from name

            continue  
            # Skip this user if duplicate found

        unique_users.add(user["name"].strip())  
        # Add cleaned name to the set to track uniqueness

        cleaned_data.append(user)  
        # Add the cleaned, unique user to the final list

    return cleaned_data  
    # Return the fully cleaned and deduplicated data


In [138]:
clean_data(data)

[{'name': 'Alice', 'rating': 5, 'feedback': 'Great product!!', 'age': '25'},
 {'name': 'Bob', 'rating': 4, 'feedback': 'ok but late Delivery', 'age': '30'},
 {'name': ' Charlie', 'rating': 2, 'feedback': 'BAD EXPERIENCE ', 'age': None},
 {'name': 'Diana', 'feedback': 'Loved it!', 'rating': 5, 'age': '28'},
 {'name': 'Eve',
  'rating': 3.5,
  'feedback': 'Average - could be better',
  'age': '20'}]