In [11]:
# with function
import json

def check_duplicate_ids(file_path):
    try:
        data = json.load(open(file_path, 'r'))
    except FileNotFoundError:
        print(f"File not found at path: {file_path}")
        return

    total_id_count = 0
    unique_ids = set()
    duplicate_ids = set()

    for item in data:
        item_id = item.get("id")
        if item_id:
            if item_id in unique_ids:
                duplicate_ids.add(item_id)
            else:
                unique_ids.add(item_id)
            total_id_count += item_id

    total_unique_ids = len(unique_ids)
    total_duplicate_ids = len(duplicate_ids)

    print(f"Total unique 'id' values: {total_unique_ids}")
    print(f"Total duplicate 'id' values: {total_duplicate_ids}")
    print(f"Total 'id' count in key 'id': {total_id_count}")

    if duplicate_ids:
        print("Duplicate 'id' values found:")
        for id in duplicate_ids:
            print(id)

# Example usage:
file_path = '/content/user_data.json'
check_duplicate_ids(file_path)

Total unique 'id' values: 1000
Total duplicate 'id' values: 0
Total 'id' count in key 'id': 500500


In [12]:
import json

def remove_duplicate_ids_and_ip_addresses(input_file_path, output_file_path):
    # Load the JSON data from the input file
    with open(input_file_path, "r") as json_file:
        data = json.load(json_file)

    # Create sets to store unique "id" and "ip_address" values
    unique_ids = set()
    unique_ip_addresses = set()
    duplicates_removed_data = []

    # Iterate through the JSON data and check for duplicate "id" and "ip_address" values
    for item in data:
        item_id = item.get("id")
        item_ip_address = item.get("ip_address")

        if item_id:
            if item_id not in unique_ids:
                unique_ids.add(item_id)
            else:
                continue  # Skip items with duplicate "id" values

        if item_ip_address:
            if item_ip_address not in unique_ip_addresses:
                unique_ip_addresses.add(item_ip_address)
            else:
                continue  # Skip items with duplicate "ip_address" values

        # If both "id" and "ip_address" are unique, add the item to the new data list
        duplicates_removed_data.append(item)

    # Write the new JSON data with duplicates removed to the output file
    with open(output_file_path, "w") as new_json_file:
        json.dump(duplicates_removed_data, new_json_file, indent=4)

    return output_file_path

# Usage example
input_file_path = "/content/user_data.json"
output_file_path = "/content/filtered_data.json"
new_file_path = remove_duplicate_ids_and_ip_addresses(input_file_path, output_file_path)
print(f"Duplicate 'id' and 'ip_address' values removed. New JSON file saved as '{new_file_path}'")


Duplicate 'id' and 'ip_address' values removed. New JSON file saved as '/content/filtered_data.json'


In [8]:
data

[{'id': 1,
  'first_name': 'Estrella',
  'last_name': 'Lefley',
  'email': 'elefley0@ifeng.com',
  'gender': 'Female',
  'ip_address': '203.78.236.252'},
 {'id': 2,
  'first_name': 'Jeth',
  'last_name': 'Dukesbury',
  'email': 'jdukesbury1@printfriendly.com',
  'gender': 'Male',
  'ip_address': '248.192.4.225'},
 {'id': 3,
  'first_name': 'Gareth',
  'last_name': 'Pluck',
  'email': 'gpluck2@fema.gov',
  'gender': 'Male',
  'ip_address': '105.105.231.37'},
 {'id': 4,
  'first_name': 'Donnajean',
  'last_name': 'Lasslett',
  'email': 'dlasslett3@scientificamerican.com',
  'gender': 'Female',
  'ip_address': '93.199.93.188'},
 {'id': 5,
  'first_name': 'Tremaine',
  'last_name': 'Poxon',
  'email': 'tpoxon4@mozilla.org',
  'gender': 'Male',
  'ip_address': '89.141.203.82'},
 {'id': 6,
  'first_name': 'Fons',
  'last_name': 'Klosa',
  'email': 'fklosa5@home.pl',
  'gender': 'Male',
  'ip_address': '109.103.187.211'},
 {'id': 7,
  'first_name': 'Irina',
  'last_name': 'Howitt',
  'email':

In [13]:
#data filtering on csv

import pandas as pd

def remove_duplicates_from_csv(input_csv_path, output_csv_path, column_names):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(input_csv_path)

    # Remove duplicates based on the specified column names
    df_no_duplicates = df.drop_duplicates(subset=column_names)

    # Write the DataFrame with duplicates removed to a new CSV file
    df_no_duplicates.to_csv(output_csv_path, index=False)

    return output_csv_path

# Specify the input and output CSV file paths and column names
input_csv_path = "/content/user_data_info.csv"
output_csv_path = "/content/filtered_csv_file.csv"
column_names = ["id", "first_name", "last_name", "email", "gender", "ip_address"]

# Call the function to remove duplicates
new_csv_file_path = remove_duplicates_from_csv(input_csv_path, output_csv_path, column_names)

print(f"Duplicate rows removed. New CSV file saved as '{new_csv_file_path}'")


Duplicate rows removed. New CSV file saved as '/content/filtered_csv_file.csv'
