<a href="https://colab.research.google.com/github/Krishnan-Raghavan/Packt/blob/main/DataCleaningAnd_PreparationChapter2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Accuracy

In [1]:
import pandas as pd

# Step 1: Import necessary libraries
# pandas (imported above as `pd`) provides the DataFrame used for the comparison.

# Step 2: Create a sample dataset and a reference dataset
# Both dictionaries share the same columns and row order so they can be compared
# cell by cell. The only difference is Charlie's age (28 here, 29 in the reference).
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, 30, 28, 28, 22],
    'Gender': ['Female', 'Male', 'Male', 'Male', 'Female'],
    'City': ['New York', 'Los Angeles', 'Chicago', 'New York', 'San Francisco']
}

# Reference dataset for accuracy comparison
reference_data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, 30, 29, 28, 22],
    'Gender': ['Female', 'Male', 'Male', 'Male', 'Female'],
    'City': ['New York', 'Los Angeles', 'Chicago', 'New York', 'San Francisco']
}

# Step 3: Create DataFrames (built exactly once)
df = pd.DataFrame(data)
reference_df = pd.DataFrame(reference_data)

# Step 4: Compare data to the reference
# The element-wise comparison yields a boolean DataFrame: True where the sample
# value equals the reference value, False where it differs.
accuracy_check = df == reference_df

# Step 5: Calculate accuracy percentage
# The mean of a boolean column is the fraction of True values, so multiplying by
# 100 gives the percentage of matching cells per column.
accuracy_percentage = accuracy_check.mean() * 100

# Step 6: Display the accuracy results
print("Accuracy Check:")
print(accuracy_check)
print("\nAccuracy Percentage:")
print(accuracy_percentage)

Accuracy Check:
   Name    Age  Gender  City
0  True   True    True  True
1  True   True    True  True
2  True  False    True  True
3  True   True    True  True
4  True   True    True  True

Accuracy Percentage:
Name      100.0
Age        80.0
Gender    100.0
City      100.0
dtype: float64


Completeness

In [2]:
import pandas as pd

# Step 1: Import necessary libraries
# pandas (imported above as `pd`) provides the DataFrame used for the check.

# Step 2: Create a sample dataset
# A simple dataset with columns 'Name', 'Age', 'Gender', and 'City'. One 'Age' and
# one 'City' value are intentionally missing (represented as None).
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, 30, None, 28, 22],
    'Gender': ['Female', 'Male', 'Male', 'Male', 'Female'],
    'City': ['New York', 'Los Angeles', 'Chicago', None, 'San Francisco']
}

# Step 3: Create a DataFrame (built exactly once)
df = pd.DataFrame(data)

# Step 4: Check completeness
# .isnull() flags the missing values and .sum() counts them per column.
# Note that `completeness` therefore holds the number of MISSING values per column.
completeness = df.isnull().sum()

# Step 5: Calculate completeness percentage
# missing / total is the share of missing values; subtracting it from 1 gives the
# share of populated values, which is then scaled to a percentage.
total_records = len(df)
completeness_percentage = (1 - completeness / total_records) * 100

# Step 6: Display the completeness results
print("Completeness Check:")
print(completeness)
print("\nCompleteness Percentage:")
print(completeness_percentage)

Completeness Check:
Name      0
Age       1
Gender    0
City      1
dtype: int64

Completeness Percentage:
Name      100.0
Age        80.0
Gender    100.0
City       80.0
dtype: float64


Timeliness

In [3]:
import pandas as pd
from datetime import datetime

# Step 1: Create the dataset
# Three hourly readings; the timestamps start out as plain strings.
data = dict(
    Timestamp=['2023-10-25 10:00:00', '2023-10-25 11:00:00', '2023-10-25 12:00:00'],
    Value=[50, 55, 60],
)

# Step 2: Define the reference timestamp
# A fixed point in time that stands in for "now" in this example.
reference_timestamp = datetime(2023, 10, 25, 12, 30)

# Step 3: Build the DataFrame and convert the 'Timestamp' column to datetime
# objects so that it can be compared against the reference.
df = pd.DataFrame(data)
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Step 4: Calculate timeliness
# A row passes the check when the reference lies strictly after its timestamp.
timeliness_check = reference_timestamp > df['Timestamp']

# Step 5: Display timeliness results
print("Timeliness Check:")
print(timeliness_check)

Timeliness Check:
0    True
1    True
2    True
Name: Timestamp, dtype: bool


Average Timeliness

In [4]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Generate a random dataset with timestamps
np.random.seed(0)  # For reproducibility
n_samples = 100
start_time = datetime(2023, 10, 25, 9, 0, 0)
end_time = datetime(2023, 10, 25, 16, 0, 0)

# Width of the sampling window in whole minutes. randint is documented for integer
# bounds, so the value is converted once here instead of passing a float per draw.
window_minutes = int((end_time - start_time).total_seconds() // 60)

# Each timestamp is start_time plus a random whole-minute offset inside the window.
# int(...) makes sure timedelta receives a built-in int rather than a NumPy scalar.
timestamps = [
    start_time + timedelta(minutes=int(np.random.randint(0, window_minutes)))
    for _ in range(n_samples)
]
values = np.random.randint(50, 101, n_samples)

df = pd.DataFrame({'Timestamp': timestamps, 'Value': values})

# Reference timestamp (current time for this example)
reference_timestamp = datetime(2023, 10, 25, 12, 0, 0)

# Define a timeliness threshold (in minutes)
timeliness_threshold = 30

# Calculate timeliness
# 'Timeliness' is the gap in minutes between the reference and the record. It is
# negative for records stamped after the reference, and those rows also satisfy
# the `<=` test below, so they are flagged as timely.
df['Timeliness'] = (reference_timestamp - df['Timestamp']).dt.total_seconds() / 60
df['Timely'] = df['Timeliness'] <= timeliness_threshold

# Calculate the average timeliness
average_timeliness = df['Timeliness'].mean()

# Display results
print("Dataset with Timestamps:")
print(df.head())

print("\nAverage Timeliness (in minutes):", average_timeliness)
print("Percentage of Timely Records:", (df['Timely'].sum() / n_samples) * 100, "%")

Dataset with Timestamps:
            Timestamp  Value  Timeliness  Timely
0 2023-10-25 11:52:00     71         8.0    True
1 2023-10-25 09:47:00     98       133.0   False
2 2023-10-25 10:57:00     99        63.0   False
3 2023-10-25 12:12:00     55       -12.0    True
4 2023-10-25 14:23:00     91      -143.0    True

Average Timeliness (in minutes): -23.8
Percentage of Timely Records: 61.0 %


Consistency

In [5]:
import pandas as pd

# Create a sample dataset
# 'Product003' deliberately breaks the naming convention used by the other rows.
data = {
    'ProductID': [1, 2, 3, 4, 5],
    'ProductName': ['PROD001', 'PROD002', 'Product003', 'PROD004', 'PROD005'],
}

df = pd.DataFrame(data)

# Define the expected prefix
expected_prefix = "PROD"

# Check consistency and create a boolean mask for inconsistent names.
# na=False makes a missing product name count as "does not start with the prefix";
# without it the result would contain NaN and could not be negated as a boolean mask.
inconsistent_mask = ~df['ProductName'].str.startswith(expected_prefix, na=False)

# Create a new column to indicate consistency (True = name follows the convention)
df['Consistency'] = ~inconsistent_mask

# Calculate the percentage of consistent rows
consistent_percentage = (df['Consistency'].sum() / len(df)) * 100

# Display the dataset with the consistency check results
print("Dataset with Consistency Check:")
print(df)

# Display the percentage of consistent rows
print(f"Percentage of Consistent Rows: {consistent_percentage:.2f}%")

Dataset with Consistency Check:
   ProductID ProductName  Consistency
0          1     PROD001         True
1          2     PROD002         True
2          3  Product003        False
3          4     PROD004         True
4          5     PROD005         True
Percentage of Consistent Rows: 80.00%


Uniqueness

In [6]:
import pandas as pd

# Sample data: a single column of e-mail addresses
data = dict(
    Email=[
        'john.doe@example.com',
        'jane.smith@example.com',
        'james.doe@example.com',
        'susan.brown@example.com',
    ],
)

df = pd.DataFrame(data)

# Flag every repeated address; the first occurrence of a value is not flagged
email_column = df['Email']
duplicated_mask = email_column.duplicated(keep='first')

# A row counts as unique when it has not been flagged as a repeat
df['Uniqueness'] = ~duplicated_mask

# Share of unique rows, expressed as a percentage of all rows
unique_count = df['Uniqueness'].sum()
total_count = len(df)
unique_percentage = (unique_count / total_count) * 100

# Show the frame including the new flag column
print("Dataset with Uniqueness Check:")
print(df)

# Show the summary figure
print(f"Percentage of Unique Records: {unique_percentage:.2f}%")

Dataset with Uniqueness Check:
                     Email  Uniqueness
0     john.doe@example.com        True
1   jane.smith@example.com        True
2    james.doe@example.com        True
3  susan.brown@example.com        True
Percentage of Unique Records: 100.00%


Data Duplication

In [7]:
import pandas as pd

# Sample data in which employee IDs 101 and 102 each appear twice
data = dict(
    EmployeeID=[101, 102, 103, 101, 104, 105, 102],
    FirstName=['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Bob'],
    LastName=['Smith', 'Johnson', 'Brown', 'Davis', 'Lee', 'White', 'Johnson'],
)

df = pd.DataFrame(data)

# Only 'EmployeeID' decides whether a row is a repeat; the first row carrying a
# given ID is kept unflagged and every later row with that ID is flagged.
duplicated_mask = df.duplicated(subset=['EmployeeID'], keep='first')

# Store the flag next to the data
df['IsDuplicate'] = duplicated_mask

# Share of flagged rows, expressed as a percentage of all rows
duplicate_count = df['IsDuplicate'].sum()
row_count = len(df)
duplicate_percentage = (duplicate_count / row_count) * 100

# Show the frame including the flag column
print("Dataset with Duplicate Records:")
print(df)

# Show the summary figure
print(f"Percentage of Duplicate Records: {duplicate_percentage:.2f}%")

Dataset with Duplicate Records:
   EmployeeID FirstName LastName  IsDuplicate
0         101     Alice    Smith        False
1         102       Bob  Johnson        False
2         103   Charlie    Brown        False
3         101     David    Davis         True
4         104       Eve      Lee        False
5         105     Frank    White        False
6         102       Bob  Johnson         True
Percentage of Duplicate Records: 28.57%


Data Usage

In [8]:
import random

# Simulated data usage metrics
def simulate_data_usage(num_users=500, seed=None):
    """Simulate per-user data usage metrics and aggregate them.

    For every simulated user three values are drawn: a utilization rate
    (uniform float between 20 and 90), a number of data requests (integer
    between 1 and 100) and a satisfaction score (integer between 1 and 5).

    Args:
        num_users: Number of users to simulate. Defaults to 500.
        seed: Optional seed. When given, a private random generator seeded
            with this value is used, so repeated calls return identical
            results. When None, the module-level ``random`` generator is used.

    Returns:
        dict with the keys
        ``data_utilization_rates`` (list of float, one per user),
        ``organization_data_utilization_rate`` (mean of the rates),
        ``data_requests`` (list of int, one per user),
        ``total_data_requests`` (sum of the requests),
        ``user_satisfaction_scores`` (list of int, one per user) and
        ``avg_user_satisfaction_score`` (mean of the scores).

    Raises:
        ValueError: If ``num_users`` is smaller than 1 (the averages would
            otherwise divide by zero).
    """
    if num_users < 1:
        raise ValueError("num_users must be a positive integer")

    # The random module exposes the same uniform/randint functions as a Random
    # instance, so either object can serve as the source of randomness.
    rng = random if seed is None else random.Random(seed)

    # Draw order (rates, then requests, then scores) is kept fixed so that a
    # seeded run always produces the same values.
    data_utilization_rates = [rng.uniform(20, 90) for _ in range(num_users)]
    data_requests = [rng.randint(1, 100) for _ in range(num_users)]
    user_satisfaction_scores = [rng.randint(1, 5) for _ in range(num_users)]

    return {
        "data_utilization_rates": data_utilization_rates,
        "organization_data_utilization_rate": sum(data_utilization_rates) / num_users,
        "data_requests": data_requests,
        "total_data_requests": sum(data_requests),
        "user_satisfaction_scores": user_satisfaction_scores,
        "avg_user_satisfaction_score": sum(user_satisfaction_scores) / num_users,
    }

# Execute the simulation once and keep the complete metrics dictionary
data_usage_metrics = simulate_data_usage()

# Each entry pairs a heading with the value printed underneath it. The two
# averages are formatted to two decimals, the request total is printed as-is.
report_lines = (
    ("\nOrganization Data Utilization Rate:",
     f"{data_usage_metrics['organization_data_utilization_rate']:.2f}%"),
    ("\nTotal Number of Data Requests or Queries:",
     data_usage_metrics["total_data_requests"]),
    ("\nAverage User Satisfaction Score:",
     f"{data_usage_metrics['avg_user_satisfaction_score']:.2f}"),
)

# Print heading and value on separate lines, in the order listed above
for heading, value in report_lines:
    print(heading)
    print(value)


Organization Data Utilization Rate:
55.12%

Total Number of Data Requests or Queries:
24967

Average User Satisfaction Score:
2.99


Data Compliance

In [9]:
import random

# Simulate a dataset with compliance checks
def simulate_data_compliance(num_records):
    """Simulate records with age/consent fields and evaluate compliance rules.

    Each record gets a random age between 18 and 100 and a random consent
    flag. A record is age-compliant when the age is at least 18, and
    consent-compliant when it is age-compliant and consent was given. The
    overall status is "Compliant" only when both rules hold.

    Note: because ages are drawn from 18 upwards, the age rule is satisfied by
    every generated record; the overall status is decided by the consent flag.

    Args:
        num_records: Number of records to generate. Zero or a negative number
            produces no records.

    Returns:
        A tuple ``(data_records, percentage_compliant)`` where
        ``data_records`` is a list of dicts with the keys "Age",
        "Consent Given", "Age Compliance", "Consent Compliance" and
        "Overall Compliance Status", and ``percentage_compliant`` is the share
        of compliant records as a percentage (0.0 when no records exist).
    """
    data_records = []
    compliant_count = 0  # Counter for compliant records

    for _ in range(num_records):
        # Generate a random record (age first, then consent, to keep the
        # sequence of random draws stable for seeded runs)
        age = random.randint(18, 100)
        consent_given = random.choice([True, False])

        # Define compliance rules; consent only counts for age-compliant records
        age_rule = age >= 18
        consent_rule = age_rule and consent_given
        is_compliant = age_rule and consent_rule

        # Count compliant records
        if is_compliant:
            compliant_count += 1

        data_records.append({
            "Age": age,
            "Consent Given": consent_given,
            "Age Compliance": "Age Compliant" if age_rule else "Age Non-Compliant",
            "Consent Compliance": "Consent Compliant" if consent_rule else "Consent Non-Compliant",
            "Overall Compliance Status": "Compliant" if is_compliant else "Non-Compliant"
        })

    # Without any records there is nothing to divide by; report 0% instead of
    # raising ZeroDivisionError.
    if not data_records:
        return data_records, 0.0

    # Calculate the percentage of compliant records
    percentage_compliant = (compliant_count / num_records) * 100

    return data_records, percentage_compliant

# How many records the simulation should produce
num_records = 100

# Run the simulation and unpack the records and the compliance percentage
simulation_result = simulate_data_compliance(num_records)
data_records, percentage_compliant = simulation_result

# Print at most `sample_size` records from the front of the list, one per line
sample_size = 10
shown = min(sample_size, len(data_records))
for position in range(shown):
    print(data_records[position])

# Finish with the overall compliance figure
print(f"\nPercentage of Compliant Records: {percentage_compliant:.2f}%")

{'Age': 25, 'Consent Given': False, 'Age Compliance': 'Age Compliant', 'Consent Compliance': 'Consent Non-Compliant', 'Overall Compliance Status': 'Non-Compliant'}
{'Age': 83, 'Consent Given': False, 'Age Compliance': 'Age Compliant', 'Consent Compliance': 'Consent Non-Compliant', 'Overall Compliance Status': 'Non-Compliant'}
{'Age': 93, 'Consent Given': True, 'Age Compliance': 'Age Compliant', 'Consent Compliance': 'Consent Compliant', 'Overall Compliance Status': 'Compliant'}
{'Age': 36, 'Consent Given': False, 'Age Compliance': 'Age Compliant', 'Consent Compliance': 'Consent Non-Compliant', 'Overall Compliance Status': 'Non-Compliant'}
{'Age': 87, 'Consent Given': False, 'Age Compliance': 'Age Compliant', 'Consent Compliance': 'Consent Non-Compliant', 'Overall Compliance Status': 'Non-Compliant'}
{'Age': 39, 'Consent Given': True, 'Age Compliance': 'Age Compliant', 'Consent Compliance': 'Consent Compliant', 'Overall Compliance Status': 'Compliant'}
{'Age': 80, 'Consent Given': True,