## Assignment: Check for Duplicates in a Dataset

In [12]:
import pandas as pd

def check_duplicates(df, columns):
    """Count the rows of *df* that are duplicated with respect to *columns*.

    A row is a duplicate when an earlier row holds the same values in every
    column named in ``columns``; the first occurrence of each value
    combination is never counted. Missing values (NaN/None) are treated as
    equal to each other, both in the total and in the per-combination
    breakdown, so the two results always agree.

    Args:
        df: The pandas DataFrame to inspect.
        columns: List of column names (strings) that define row identity.

    Returns:
        A dict with two keys:

        * ``'count'``: total number of duplicate rows.
        * ``'samples'``: DataFrame with one row per duplicated value
          combination, containing the ``columns`` values plus a
          ``number_of_duplicates`` column (how many extra copies exist).
          Empty when there are no duplicates.

        ``None`` if the duplicate computation itself fails; in that case the
        error message is printed.

    Raises:
        TypeError: If ``df`` is not a DataFrame, ``columns`` is not a list,
            or any element of ``columns`` is not a string.
        ValueError: If any element of ``columns`` is not a column of ``df``.
    """
    # Validate inputs up front; these errors are raised, not swallowed.
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input df must be a pandas DataFrame.")
    if not isinstance(columns, list):
        raise TypeError("Input columns must be a list.")
    if not all(isinstance(col, str) for col in columns):
        raise TypeError("All elements in columns must be string.")
    if not all(col in df.columns for col in columns):
        raise ValueError("All elements in columns must correspond to column names in the DataFrame.")

    try:
        # Flag every repeat of an already-seen combination (first one kept).
        duplicates = df.duplicated(subset=columns, keep='first')

        # Total number of flagged rows.
        count = duplicates.sum()

        # Per-combination breakdown. dropna=False is required because
        # duplicated() treats NaN keys as equal, whereas groupby() drops
        # NaN keys by default -- without it, rows duplicated on a missing
        # value would be counted above but absent from the breakdown.
        samples = (
            df[duplicates]
            .groupby(columns, dropna=False)
            .size()
            .reset_index(name='number_of_duplicates')
        )

        return {'count': count, 'samples': samples}

    except Exception as e:
        # Documented best-effort contract: report the failure and return None.
        print(f"An error occurred: {e}")
        return None




# Demo data: every pairing of col_1 in {A, B} with col_2 in {a, b, c}
# (all with col_3 == 'x'), followed by one extra A/a row whose col_3 differs.
df_1 = pd.DataFrame(
    [[group, item, 'x', 1] for group in 'AB' for item in 'abc']
    + [['A', 'a', 'y', 1]],
    columns=['col_1', 'col_2', 'col_3', 'col_4'],
)

# Column subsets to run the duplicate check against.
column_lists = [
    ['col_1'],
    ['col_1', 'col_2'],
    ['col_1', 'col_2', 'col_3'],
    ['col_3', 'col_4'],
]

# Report the duplicate total and per-combination breakdown for each subset.
for cols in column_lists:
    print(f"Checking duplicates on columns: {cols}")
    result = check_duplicates(df_1, cols)
    print(
        f"Count: {result['count']}",
        f"Samples: \n{result['samples']}\n",
        sep="\n",
    )

Checking duplicates on columns: ['col_1']
Count: 5
Samples: 
  col_1  number_of_duplicates
0     A                     3
1     B                     2

Checking duplicates on columns: ['col_1', 'col_2']
Count: 1
Samples: 
  col_1 col_2  number_of_duplicates
0     A     a                     1

Checking duplicates on columns: ['col_1', 'col_2', 'col_3']
Count: 0
Samples: 
Empty DataFrame
Columns: [col_1, col_2, col_3, number_of_duplicates]
Index: []

Checking duplicates on columns: ['col_3', 'col_4']
Count: 5
Samples: 
  col_3  col_4  number_of_duplicates
0     x      1                     5



### Documentation

    check_duplicates(df, columns)

    Purpose:

This function checks a pandas DataFrame for duplicate rows based on specified columns.

    Input:

df: pandas DataFrame.

columns: List of column names as strings.

    Output:

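A dictionary with two keys: count (the number of rows that repeat an earlier row on the given columns; the first occurrence of each combination is not counted) and samples (a DataFrame with one row per duplicated combination of column values and a number_of_duplicates column giving how many extra copies of that combination exist).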

    Notes:

- Assumption is that the dataset is pandas DataFrame.
- Function checks that df is a DataFrame and columns is a list of strings; otherwise a TypeError is raised.
- If a name in columns is not a column of df, a ValueError is raised.
- If no duplicates are found, count is 0 and samples is an empty DataFrame.
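- Errors raised while computing the duplicates (after input validation has passed) are caught: the function prints an error message and returns None. Validation errors are not caught; they are raised to the caller.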